bakery-ia/services/data/app/external/processors/madrid_processor.py

# ================================================================
# services/data/app/external/processors/madrid_processor.py
# ================================================================
"""
Data transformation and parsing for Madrid traffic data
Handles XML parsing, CSV processing, coordinate conversion, and data quality scoring
"""

import csv
import io
import math
import re
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple
import structlog
import pyproj

from ..models.madrid_models import TrafficRecord, MeasurementPoint, CongestionLevel


class MadridTrafficDataProcessor:
    """Handles all data transformation and parsing for Madrid traffic data"""

    def __init__(self):
        self.logger = structlog.get_logger()
        # UTM Zone 30N (Madrid's coordinate system)
        self.utm_proj = pyproj.Proj(proj='utm', zone=30, ellps='WGS84', datum='WGS84')
        self.wgs84_proj = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')

    def safe_int(self, value: str) -> int:
        """Safely convert string to int"""
        try:
            return int(float(value.replace(',', '.')))
        except (ValueError, TypeError):
            return 0

    def _safe_float(self, value: str) -> float:
        """Safely convert string to float"""
        try:
            return float(value.replace(',', '.'))
        except (ValueError, TypeError):
            return 0.0

    def clean_madrid_xml(self, xml_content: str) -> str:
        """Clean and prepare Madrid XML content for parsing"""
        if not xml_content:
            return ""

        # Remove BOM and extra whitespace
        cleaned = xml_content.strip()
        if cleaned.startswith('\ufeff'):
            cleaned = cleaned[1:]

        # Fix common XML issues
        cleaned = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;)', '&amp;', cleaned)

        # Ensure proper encoding declaration
        if not cleaned.startswith('<?xml'):
            cleaned = '<?xml version="1.0" encoding="UTF-8"?>\n' + cleaned

        return cleaned

    def convert_utm_to_latlon(self, utm_x: str, utm_y: str) -> Tuple[Optional[float], Optional[float]]:
        """Convert UTM coordinates to latitude/longitude"""
        try:
            utm_x_float = float(utm_x.replace(',', '.'))
            utm_y_float = float(utm_y.replace(',', '.'))

            # Convert from UTM Zone 30N to WGS84
            longitude, latitude = pyproj.transform(self.utm_proj, self.wgs84_proj, utm_x_float, utm_y_float)

            # Validate coordinates are in Madrid area
            if 40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5:
                return latitude, longitude
            else:
                self.logger.debug("Coordinates outside Madrid bounds",
                                lat=latitude, lon=longitude, utm_x=utm_x, utm_y=utm_y)
                return None, None

        except Exception as e:
            self.logger.debug("UTM conversion error",
                            utm_x=utm_x, utm_y=utm_y, error=str(e))
            return None, None

    def parse_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]:
        """Parse Madrid traffic XML data"""
        traffic_points = []

        try:
            cleaned_xml = self.clean_madrid_xml(xml_content)
            root = ET.fromstring(cleaned_xml)

            self.logger.debug("Madrid XML structure", root_tag=root.tag, children_count=len(list(root)))

            if root.tag == 'pms':
                pm_elements = root.findall('pm')
                self.logger.debug("Found PM elements", count=len(pm_elements))

                for pm in pm_elements:
                    try:
                        traffic_point = self._extract_madrid_pm_element(pm)

                        if self._is_valid_traffic_point(traffic_point):
                            traffic_points.append(traffic_point)

                            # Log first few points for debugging
                            if len(traffic_points) <= 3:
                                self.logger.debug("Sample traffic point",
                                               id=traffic_point['idelem'],
                                               lat=traffic_point['latitude'],
                                               lon=traffic_point['longitude'],
                                               intensity=traffic_point.get('intensidad'))

                    except Exception as e:
                        self.logger.debug("Error parsing PM element", error=str(e))
                        continue
            else:
                self.logger.warning("Unexpected XML root tag", root_tag=root.tag)

            self.logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points))
            return traffic_points

        except ET.ParseError as e:
            self.logger.warning("Failed to parse Madrid XML", error=str(e))
            return self._extract_traffic_data_regex(xml_content)
        except Exception as e:
            self.logger.error("Error in Madrid traffic XML parsing", error=str(e))
            return []

    def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]:
        """Extract traffic data from Madrid <pm> element with coordinate conversion"""
        try:
            point_data = {}
            utm_x = utm_y = None

            # Extract all child elements
            for child in pm_element:
                tag, text = child.tag, child.text.strip() if child.text else ''

                if tag == 'idelem':
                    point_data['idelem'] = text
                elif tag == 'descripcion':
                    point_data['descripcion'] = text
                elif tag == 'intensidad':
                    point_data['intensidad'] = self.safe_int(text)
                elif tag == 'ocupacion':
                    point_data['ocupacion'] = self._safe_float(text)
                elif tag == 'carga':
                    point_data['carga'] = self.safe_int(text)
                elif tag == 'nivelServicio':
                    point_data['nivelServicio'] = self.safe_int(text)
                elif tag == 'st_x':  # UTM X coordinate
                    utm_x = text
                    point_data['utm_x'] = text
                elif tag == 'st_y':  # UTM Y coordinate
                    utm_y = text
                    point_data['utm_y'] = text
                elif tag == 'error':
                    point_data['error'] = text
                elif tag in ['subarea', 'accesoAsociado', 'intensidadSat']:
                    point_data[tag] = text

            # Convert coordinates
            if utm_x and utm_y:
                latitude, longitude = self.convert_utm_to_latlon(utm_x, utm_y)

                if latitude and longitude:
                    point_data.update({
                        'latitude': latitude,
                        'longitude': longitude,
                        'measurement_point_id': point_data.get('idelem'),
                        'measurement_point_name': point_data.get('descripcion'),
                        'timestamp': datetime.now(timezone.utc),
                        'source': 'madrid_opendata_xml'
                    })

                    return point_data
                else:
                    self.logger.debug("Invalid coordinates after conversion",
                                   idelem=point_data.get('idelem'), utm_x=utm_x, utm_y=utm_y)
                    return {}
            else:
                self.logger.debug("Missing UTM coordinates", idelem=point_data.get('idelem'))
                return {}

        except Exception as e:
            self.logger.debug("Error extracting PM element", error=str(e))
            return {}

    def _is_valid_traffic_point(self, traffic_point: Dict[str, Any]) -> bool:
        """Validate traffic point data"""
        required_fields = ['idelem', 'latitude', 'longitude']
        return all(field in traffic_point and traffic_point[field] for field in required_fields)

    def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]:
        """Fallback regex-based extraction if XML parsing fails"""
        traffic_points = []

        try:
            # Pattern to match PM elements
            pm_pattern = r'<pm>(.*?)</pm>'
            pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL)

            for pm_content in pm_matches:
                traffic_point = {}

                # Extract key fields
                patterns = {
                    'idelem': r'<idelem>(.*?)</idelem>',
                    'descripcion': r'<descripcion>(.*?)</descripcion>',
                    'intensidad': r'<intensidad>(.*?)</intensidad>',
                    'ocupacion': r'<ocupacion>(.*?)</ocupacion>',
                    'st_x': r'<st_x>(.*?)</st_x>',
                    'st_y': r'<st_y>(.*?)</st_y>'
                }

                for field, pattern in patterns.items():
                    match = re.search(pattern, pm_content)
                    if match:
                        traffic_point[field] = match.group(1).strip()

                # Convert coordinates
                if 'st_x' in traffic_point and 'st_y' in traffic_point:
                    latitude, longitude = self.convert_utm_to_latlon(
                        traffic_point['st_x'], traffic_point['st_y']
                    )

                    if latitude and longitude:
                        traffic_point.update({
                            'latitude': latitude,
                            'longitude': longitude,
                            'intensidad': self.safe_int(traffic_point.get('intensidad', '0')),
                            'ocupacion': self._safe_float(traffic_point.get('ocupacion', '0')),
                            'measurement_point_id': traffic_point.get('idelem'),
                            'measurement_point_name': traffic_point.get('descripcion'),
                            'timestamp': datetime.now(timezone.utc),
                            'source': 'madrid_opendata_xml_regex'
                        })

                        traffic_points.append(traffic_point)

            self.logger.debug("Regex extraction completed", points=len(traffic_points))
            return traffic_points

        except Exception as e:
            self.logger.error("Error in regex extraction", error=str(e))
            return []

    def parse_measurement_points_csv(self, csv_content: str) -> Dict[str, Dict[str, Any]]:
        """Parse measurement points CSV into lookup dictionary"""
        measurement_points = {}

        try:
            # Parse CSV with semicolon delimiter
            csv_reader = csv.DictReader(io.StringIO(csv_content), delimiter=';')

            processed_count = 0
            for row in csv_reader:
                try:
                    # Extract point ID and coordinates
                    point_id = row.get('id', '').strip()
                    if not point_id:
                        continue

                    processed_count += 1

                    # Try different coordinate field names
                    lat_str = ''
                    lon_str = ''

                    # Common coordinate field patterns
                    lat_fields = ['lat', 'latitude', 'latitud', 'y', 'utm_y']
                    lon_fields = ['lon', 'lng', 'longitude', 'longitud', 'x', 'utm_x']

                    for field in lat_fields:
                        if field in row and row[field].strip():
                            lat_str = row[field].strip()
                            break

                    for field in lon_fields:
                        if field in row and row[field].strip():
                            lon_str = row[field].strip()
                            break

                    if lat_str and lon_str:
                        try:
                            # Try direct lat/lon first
                            latitude = self._safe_float(lat_str)
                            longitude = self._safe_float(lon_str)

                            # If values look like UTM coordinates, convert them
                            if latitude > 1000 or longitude > 1000:
                                latitude, longitude = self.convert_utm_to_latlon(lon_str, lat_str)
                                if not latitude or not longitude:
                                    continue

                            # Validate Madrid area
                            if not (40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5):
                                continue

                            measurement_points[point_id] = {
                                'id': point_id,
                                'latitude': latitude,
                                'longitude': longitude,
                                'name': row.get('nombre', row.get('descripcion', f"Point {point_id}")),
                                'type': row.get('tipo', 'traffic'),
                                'raw_data': dict(row)  # Keep original data
                            }

                        except Exception as e:
                            self.logger.debug("Error processing point coordinates",
                                            point_id=point_id, error=str(e))
                            continue

                except Exception as e:
                    self.logger.debug("Error processing CSV row", error=str(e))
                    continue

            self.logger.info("Parsed measurement points registry",
                           total_points=len(measurement_points))
            return measurement_points

        except Exception as e:
            self.logger.error("Error parsing measurement points CSV", error=str(e))
            return {}

    def calculate_data_quality_score(self, row: Dict[str, str]) -> float:
        """Calculate data quality score for a traffic record"""
        try:
            score = 1.0

            # Check for missing or invalid values
            intensidad = row.get('intensidad', '').strip()
            if not intensidad or intensidad in ['N', '', '0']:
                score *= 0.7

            ocupacion = row.get('ocupacion', '').strip()
            if not ocupacion or ocupacion in ['N', '', '0']:
                score *= 0.8

            error_status = row.get('error', '').strip()
            if error_status and error_status != 'N':
                score *= 0.6

            # Check for reasonable value ranges
            try:
                intensidad_val = self.safe_int(intensidad)
                if intensidad_val < 0 or intensidad_val > 5000:  # Unrealistic traffic volume
                    score *= 0.7

                ocupacion_val = self.safe_int(ocupacion)
                if ocupacion_val < 0 or ocupacion_val > 100:  # Invalid percentage
                    score *= 0.5

            except:
                score *= 0.6

            return max(0.1, score)  # Minimum quality score

        except Exception as e:
            self.logger.debug("Error calculating quality score", error=str(e))
            return 0.5  # Default medium quality

    async def process_csv_content_chunked(self, text_content: str, csv_filename: str,
                                        nearest_ids: set, nearest_points: list) -> list:
        """Process CSV content in chunks to prevent memory issues"""
        import csv
        import io
        import gc

        try:
            csv_reader = csv.DictReader(io.StringIO(text_content), delimiter=';')

            chunk_size = 10000
            chunk_records = []
            all_records = []
            processed_count = 0
            total_rows_seen = 0

            for row in csv_reader:
                total_rows_seen += 1
                measurement_point_id = row.get('id', '').strip()

                if measurement_point_id not in nearest_ids:
                    continue

                try:
                    record_data = await self.parse_historical_csv_row(row, nearest_points)

                    if record_data:
                        chunk_records.append(record_data)
                        processed_count += 1

                        if len(chunk_records) >= chunk_size:
                            all_records.extend(chunk_records)
                            chunk_records = []
                            gc.collect()

                except Exception as e:
                    if processed_count < 5:
                        self.logger.error("Row parsing exception",
                                        row_num=total_rows_seen,
                                        measurement_point_id=measurement_point_id,
                                        error=str(e))
                    continue

            # Process remaining records
            if chunk_records:
                all_records.extend(chunk_records)
                chunk_records = []
                gc.collect()

            self.logger.info("Processed CSV file",
                           filename=csv_filename,
                           total_rows_read=total_rows_seen,
                           processed_records=processed_count)

            return all_records

        except Exception as e:
            self.logger.error("Error processing CSV content",
                            filename=csv_filename, error=str(e))
            return []

    async def parse_historical_csv_row(self, row: dict, nearest_points: list) -> dict:
        """Parse a single row from Madrid's historical traffic CSV"""
        try:
            # Extract date
            fecha_str = row.get('fecha', '').strip()
            if not fecha_str:
                return None

            try:
                from datetime import datetime, timezone
                date_obj = datetime.strptime(fecha_str, '%Y-%m-%d %H:%M:%S')
                date_obj = date_obj.replace(tzinfo=timezone.utc)
            except Exception:
                return None

            measurement_point_id = row.get('id', '').strip()

            # Find point data
            point_match = next((p for p in nearest_points if p[0] == measurement_point_id), None)
            if not point_match:
                return None

            point_data = point_match[1]
            distance_km = point_match[2]

            # Extract traffic data
            intensidad = self.safe_int(row.get('intensidad', '0'))
            ocupacion = self.safe_int(row.get('ocupacion', '0'))
            carga = self.safe_int(row.get('carga', '0'))
            vmed = self.safe_int(row.get('vmed', '0'))

            # Build basic result (business logic will be applied elsewhere)
            result = {
                'date': date_obj,
                'measurement_point_id': measurement_point_id,
                'point_data': point_data,
                'distance_km': distance_km,
                'traffic_data': {
                    'intensidad': intensidad,
                    'ocupacion': ocupacion,
                    'carga': carga,
                    'vmed': vmed
                },
                'data_quality_score': self.calculate_data_quality_score(row),
                'raw_row': row
            }

            return result

        except Exception as e:
            self.logger.debug("Error parsing historical CSV row", error=str(e))
            return None