Refactor the traffic fetching system

services/data/app/external/apis/madrid_traffic_client.py (1788 lines changed)
    File diff suppressed because it is too large.

services/data/app/external/clients/__init__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
# ================================================================
# services/data/app/external/clients/__init__.py
# ================================================================
"""
HTTP clients package
"""

from .madrid_client import MadridTrafficAPIClient

__all__ = [
    'MadridTrafficAPIClient'
]

services/data/app/external/clients/madrid_client.py (new file, 155 lines)
@@ -0,0 +1,155 @@
# ================================================================
# services/data/app/external/clients/madrid_client.py
# ================================================================
"""
Pure HTTP client for Madrid traffic APIs
Handles only HTTP communication and response decoding
"""

import httpx
import structlog
from datetime import datetime
from typing import Optional, Dict, Any

from ..base_client import BaseAPIClient


class MadridTrafficAPIClient(BaseAPIClient):
    """Pure HTTP client for Madrid traffic APIs"""

    TRAFFIC_ENDPOINT = "https://datos.madrid.es/egob/catalogo/202468-10-intensidad-trafico.xml"
    MEASUREMENT_POINTS_URL = "https://datos.madrid.es/egob/catalogo/202468-263-intensidad-trafico.csv"

    def __init__(self):
        super().__init__(base_url="https://datos.madrid.es")
        self.logger = structlog.get_logger()

    def _decode_response_content(self, response) -> Optional[str]:
        """Decode response content with multiple encoding attempts"""
        try:
            return response.text
        except UnicodeDecodeError:
            # Try manual encoding for Spanish content
            for encoding in ['utf-8', 'latin-1', 'windows-1252', 'iso-8859-1']:
                try:
                    content = response.content.decode(encoding)
                    if content and len(content) > 100:
                        self.logger.debug("Successfully decoded with encoding", encoding=encoding)
                        return content
                except UnicodeDecodeError:
                    continue
            return None

    def _build_historical_url(self, year: int, month: int) -> str:
        """Build historical ZIP URL for given year and month"""
        # Madrid historical data URL pattern
        base_url = "https://datos.madrid.es/egob/catalogo/208627"

        # URL numbering pattern (this may need adjustment based on actual URLs)
        if year == 2023:
            url_number = 116 + (month - 1)  # 116-127 for 2023
        elif year == 2024:
            url_number = 128 + (month - 1)  # 128-139 for 2024
        else:
            url_number = 116  # Fallback

        return f"{base_url}-{url_number}-transporte-ptomedida-historico.zip"

    async def fetch_current_traffic_xml(self, endpoint: Optional[str] = None) -> Optional[str]:
        """Fetch current traffic XML data"""
        endpoint = endpoint or self.TRAFFIC_ENDPOINT

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'application/xml,text/xml,*/*',
                'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Cache-Control': 'no-cache',
                'Referer': 'https://datos.madrid.es/'
            }

            response = await self.get(endpoint, headers=headers, timeout=30)

            if not response or response.status_code != 200:
                self.logger.warning("Failed to fetch XML data",
                                    endpoint=endpoint,
                                    status=response.status_code if response else None)
                return None

            # Get XML content with encoding handling
            xml_content = self._decode_response_content(response)
            if not xml_content:
                self.logger.debug("No XML content received", endpoint=endpoint)
                return None

            self.logger.debug("Madrid XML content fetched",
                              length=len(xml_content),
                              endpoint=endpoint)

            return xml_content

        except Exception as e:
            self.logger.error("Error fetching traffic XML data",
                              endpoint=endpoint,
                              error=str(e))
            return None

    async def fetch_measurement_points_csv(self, url: Optional[str] = None) -> Optional[str]:
        """Fetch measurement points CSV data"""
        url = url or self.MEASUREMENT_POINTS_URL

        try:
            async with httpx.AsyncClient(
                timeout=30.0,
                headers={
                    'User-Agent': 'MadridTrafficClient/2.0',
                    'Accept': 'text/csv,application/csv,*/*'
                },
                follow_redirects=True
            ) as client:

                self.logger.debug("Fetching measurement points registry", url=url)
                response = await client.get(url)

                if response.status_code == 200:
                    return response.text
                else:
                    self.logger.warning("Failed to fetch measurement points",
                                        status=response.status_code, url=url)
                    return None

        except Exception as e:
            self.logger.error("Error fetching measurement points registry",
                              url=url, error=str(e))
            return None

    async def fetch_historical_zip(self, zip_url: str) -> Optional[bytes]:
        """Fetch historical traffic ZIP file"""
        try:
            async with httpx.AsyncClient(
                timeout=120.0,  # Longer timeout for large files
                headers={
                    'User-Agent': 'MadridTrafficClient/2.0',
                    'Accept': 'application/zip,*/*'
                },
                follow_redirects=True
            ) as client:

                self.logger.debug("Fetching historical ZIP", url=zip_url)
                response = await client.get(zip_url)

                if response.status_code == 200:
                    self.logger.debug("Historical ZIP fetched",
                                      url=zip_url,
                                      size=len(response.content))
                    return response.content
                else:
                    self.logger.warning("Failed to fetch historical ZIP",
                                        status=response.status_code, url=zip_url)
                    return None

        except Exception as e:
            self.logger.error("Error fetching historical ZIP",
                              url=zip_url, error=str(e))
            return None
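
A minimal usage sketch of the new client, assuming the hypothetical import root app.external.clients, that BaseAPIClient.get() behaves as used above, and that the Madrid endpoints are reachable:

import asyncio

from app.external.clients import MadridTrafficAPIClient  # hypothetical import root

async def main():
    client = MadridTrafficAPIClient()

    xml_content = await client.fetch_current_traffic_xml()
    if xml_content:
        print(f"traffic XML: {len(xml_content)} characters")

    csv_content = await client.fetch_measurement_points_csv()
    if csv_content:
        print(f"measurement points CSV: {len(csv_content.splitlines())} lines")

asyncio.run(main())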

services/data/app/external/models/__init__.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# ================================================================
# services/data/app/external/models/__init__.py
# ================================================================
"""
Madrid traffic models package
"""

from .madrid_models import (
    TrafficServiceLevel,
    CongestionLevel,
    MeasurementPoint,
    TrafficRecord
)

__all__ = [
    'TrafficServiceLevel',
    'CongestionLevel',
    'MeasurementPoint',
    'TrafficRecord'
]

services/data/app/external/models/madrid_models.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# ================================================================
# services/data/app/external/models/madrid_models.py
# ================================================================
"""
Data structures, enums, and dataclasses for Madrid traffic system
"""

from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Optional


class TrafficServiceLevel(Enum):
    """Madrid traffic service levels"""
    FLUID = 0
    DENSE = 1
    CONGESTED = 2
    BLOCKED = 3


class CongestionLevel(Enum):
    """Standardized congestion levels"""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    BLOCKED = "blocked"


@dataclass
class MeasurementPoint:
    """Madrid measurement point data structure"""
    id: str
    latitude: float
    longitude: float
    distance: float
    name: str
    type: str


@dataclass
class TrafficRecord:
    """Standardized traffic record with pedestrian inference"""
    date: datetime
    traffic_volume: int
    occupation_percentage: int
    load_percentage: int
    average_speed: int
    congestion_level: str
    pedestrian_count: int
    measurement_point_id: str
    measurement_point_name: str
    road_type: str
    source: str
    district: Optional[str] = None

    # Madrid-specific data
    intensidad_raw: Optional[int] = None
    ocupacion_raw: Optional[int] = None
    carga_raw: Optional[int] = None
    vmed_raw: Optional[int] = None

    # Pedestrian inference metadata
    pedestrian_multiplier: Optional[float] = None
    time_pattern_factor: Optional[float] = None
    district_factor: Optional[float] = None
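
For reference, a minimal sketch constructing one of these records; every value is illustrative and the import root app.external.models is an assumption:

from datetime import datetime, timezone

from app.external.models import TrafficRecord, CongestionLevel  # hypothetical import root

record = TrafficRecord(
    date=datetime(2024, 3, 15, 13, 0, tzinfo=timezone.utc),
    traffic_volume=420,
    occupation_percentage=35,
    load_percentage=40,
    average_speed=28,
    congestion_level=CongestionLevel.MEDIUM.value,
    pedestrian_count=0,  # filled in later by the analyzer
    measurement_point_id="3840",
    measurement_point_name="Gran Via - Callao",
    road_type="URB",
    source="madrid_opendata_xml",
    district="Centro",
)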

services/data/app/external/processors/__init__.py (new file, 14 lines)
@@ -0,0 +1,14 @@
# ================================================================
# services/data/app/external/processors/__init__.py
# ================================================================
"""
Data processors package
"""

from .madrid_processor import MadridTrafficDataProcessor
from .madrid_business_logic import MadridTrafficAnalyzer

__all__ = [
    'MadridTrafficDataProcessor',
    'MadridTrafficAnalyzer'
]

services/data/app/external/processors/madrid_business_logic.py (new file, 346 lines)
@@ -0,0 +1,346 @@
# ================================================================
# services/data/app/external/processors/madrid_business_logic.py
# ================================================================
"""
Business rules, inference, and domain logic for Madrid traffic data
Handles pedestrian inference, district mapping, road classification, and validation
"""

import math
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
import structlog

from ..models.madrid_models import TrafficRecord, CongestionLevel


class MadridTrafficAnalyzer:
    """Handles business logic for Madrid traffic analysis"""

    # Madrid district characteristics for pedestrian patterns
    DISTRICT_MULTIPLIERS = {
        'Centro': 2.5,             # Historic center, high pedestrian activity
        'Salamanca': 2.0,          # Shopping area, high foot traffic
        'Chamberí': 1.8,           # Business district
        'Retiro': 2.2,             # Near park, high leisure activity
        'Chamartín': 1.6,          # Business/residential
        'Tetuán': 1.4,             # Mixed residential/commercial
        'Fuencarral': 1.3,         # Residential with commercial areas
        'Moncloa': 1.7,            # University area
        'Latina': 1.5,             # Residential area
        'Carabanchel': 1.2,        # Residential periphery
        'Usera': 1.1,              # Industrial/residential
        'Villaverde': 1.0,         # Industrial area
        'Villa de Vallecas': 1.0,  # Peripheral residential
        'Vicálvaro': 0.9,          # Peripheral
        'San Blas': 1.1,           # Residential
        'Barajas': 0.8,            # Airport area, low pedestrian activity
        'Hortaleza': 1.2,          # Mixed area
        'Ciudad Lineal': 1.3,      # Linear development
        'Puente de Vallecas': 1.2, # Working class area
        'Moratalaz': 1.1,          # Residential
        'Arganzuela': 1.6,         # Near center, growing area
    }

    # Time-based patterns (hour of day)
    TIME_PATTERNS = {
        'morning_peak': {'hours': [7, 8, 9], 'multiplier': 2.0},
        'lunch_peak': {'hours': [12, 13, 14], 'multiplier': 2.5},
        'evening_peak': {'hours': [18, 19, 20], 'multiplier': 2.2},
        'afternoon': {'hours': [15, 16, 17], 'multiplier': 1.8},
        'late_evening': {'hours': [21, 22], 'multiplier': 1.5},
        'night': {'hours': [23, 0, 1, 2, 3, 4, 5, 6], 'multiplier': 0.3},
        'morning': {'hours': [10, 11], 'multiplier': 1.4}
    }

    # Road type specific patterns
    ROAD_TYPE_BASE = {
        'URB': 250,  # Urban streets - high pedestrian activity
        'M30': 50,   # Ring road - minimal pedestrians
        'C30': 75,   # Secondary ring - some pedestrian access
        'A': 25,     # Highways - very low pedestrians
        'R': 40      # Radial roads - low to moderate
    }

    # Weather impact on pedestrian activity
    WEATHER_IMPACT = {
        'rain': 0.6,          # 40% reduction in rain
        'hot_weather': 0.8,   # 20% reduction when very hot
        'cold_weather': 0.7,  # 30% reduction when very cold
        'normal': 1.0         # No impact
    }

    def __init__(self):
        self.logger = structlog.get_logger()

    def calculate_pedestrian_flow(
        self,
        traffic_record: TrafficRecord,
        location_context: Optional[Dict[str, Any]] = None
    ) -> Tuple[int, Dict[str, float]]:
        """
        Calculate pedestrian flow estimate with detailed metadata

        Returns:
            Tuple of (pedestrian_count, inference_metadata)
        """
        # Base calculation from road type
        road_type = traffic_record.road_type or 'URB'
        base_pedestrians = self.ROAD_TYPE_BASE.get(road_type, 200)

        # Time pattern adjustment
        hour = traffic_record.date.hour
        time_factor = self._get_time_pattern_factor(hour)

        # District adjustment (if available)
        district_factor = 1.0
        district = traffic_record.district or self.infer_district_from_location(location_context)
        if district:
            district_factor = self.DISTRICT_MULTIPLIERS.get(district, 1.0)

        # Traffic correlation adjustment
        traffic_factor = self._calculate_traffic_correlation(traffic_record)

        # Weather adjustment (if data available)
        weather_factor = self._get_weather_factor(traffic_record.date, location_context)

        # Weekend adjustment
        weekend_factor = self._get_weekend_factor(traffic_record.date)

        # Combined calculation
        pedestrian_count = int(
            base_pedestrians *
            time_factor *
            district_factor *
            traffic_factor *
            weather_factor *
            weekend_factor
        )

        # Ensure reasonable bounds
        pedestrian_count = max(10, min(2000, pedestrian_count))

        # Metadata for model training
        inference_metadata = {
            'base_pedestrians': base_pedestrians,
            'time_factor': time_factor,
            'district_factor': district_factor,
            'traffic_factor': traffic_factor,
            'weather_factor': weather_factor,
            'weekend_factor': weekend_factor,
            'inferred_district': district,
            'hour': hour,
            'road_type': road_type
        }

        return pedestrian_count, inference_metadata

    def _get_time_pattern_factor(self, hour: int) -> float:
        """Get time-based pedestrian activity multiplier"""
        for pattern, config in self.TIME_PATTERNS.items():
            if hour in config['hours']:
                return config['multiplier']
        return 1.0  # Default multiplier

    def _calculate_traffic_correlation(self, traffic_record: TrafficRecord) -> float:
        """
        Calculate pedestrian correlation with traffic patterns
        Higher traffic in urban areas often correlates with more pedestrians
        """
        if traffic_record.road_type == 'URB':
            # Urban areas: moderate traffic indicates commercial activity
            if 30 <= traffic_record.load_percentage <= 70:
                return 1.3  # Sweet spot for pedestrian activity
            elif traffic_record.load_percentage > 70:
                return 0.9  # Too congested, pedestrians avoid
            else:
                return 1.0  # Normal correlation
        else:
            # Highway/ring roads: more traffic = fewer pedestrians
            if traffic_record.load_percentage > 60:
                return 0.5
            else:
                return 0.8

    def _get_weather_factor(self, date: datetime, location_context: Optional[Dict] = None) -> float:
        """Estimate weather impact on pedestrian activity"""
        # Simplified weather inference based on season and typical Madrid patterns
        month = date.month

        # Madrid seasonal patterns
        if month in [12, 1, 2]:  # Winter - cold weather impact
            return self.WEATHER_IMPACT['cold_weather']
        elif month in [7, 8]:  # Summer - hot weather impact
            return self.WEATHER_IMPACT['hot_weather']
        elif month in [10, 11, 3, 4]:  # Rainy seasons - moderate impact
            return 0.85
        else:  # Spring/early summer - optimal weather
            return 1.1

    def _get_weekend_factor(self, date: datetime) -> float:
        """Weekend vs weekday pedestrian patterns"""
        weekday = date.weekday()
        hour = date.hour

        if weekday >= 5:  # Weekend
            if 11 <= hour <= 16:  # Weekend shopping/leisure hours
                return 1.4
            elif 20 <= hour <= 23:  # Weekend evening activity
                return 1.3
            else:
                return 0.9
        else:  # Weekday
            return 1.0

    def infer_district_from_location(self, location_context: Optional[Dict] = None) -> Optional[str]:
        """
        Infer Madrid district from location context or coordinates
        """
        if not location_context:
            return None

        lat = location_context.get('latitude')
        lon = location_context.get('longitude')

        if not (lat and lon):
            return None

        # Madrid district boundaries (simplified boundaries for inference)
        districts = {
            # Central districts
            'Centro': {'lat_min': 40.405, 'lat_max': 40.425, 'lon_min': -3.720, 'lon_max': -3.690},
            'Arganzuela': {'lat_min': 40.385, 'lat_max': 40.410, 'lon_min': -3.720, 'lon_max': -3.680},
            'Retiro': {'lat_min': 40.405, 'lat_max': 40.425, 'lon_min': -3.690, 'lon_max': -3.660},
            'Salamanca': {'lat_min': 40.420, 'lat_max': 40.445, 'lon_min': -3.690, 'lon_max': -3.660},
            'Chamartín': {'lat_min': 40.445, 'lat_max': 40.480, 'lon_min': -3.690, 'lon_max': -3.660},
            'Tetuán': {'lat_min': 40.445, 'lat_max': 40.470, 'lon_min': -3.720, 'lon_max': -3.690},
            'Chamberí': {'lat_min': 40.425, 'lat_max': 40.450, 'lon_min': -3.720, 'lon_max': -3.690},
            'Fuencarral-El Pardo': {'lat_min': 40.470, 'lat_max': 40.540, 'lon_min': -3.750, 'lon_max': -3.650},
            'Moncloa-Aravaca': {'lat_min': 40.430, 'lat_max': 40.480, 'lon_min': -3.750, 'lon_max': -3.720},
            'Latina': {'lat_min': 40.380, 'lat_max': 40.420, 'lon_min': -3.750, 'lon_max': -3.720},
            'Carabanchel': {'lat_min': 40.350, 'lat_max': 40.390, 'lon_min': -3.750, 'lon_max': -3.720},
            'Usera': {'lat_min': 40.350, 'lat_max': 40.385, 'lon_min': -3.720, 'lon_max': -3.690},
            'Puente de Vallecas': {'lat_min': 40.370, 'lat_max': 40.410, 'lon_min': -3.680, 'lon_max': -3.640},
            'Moratalaz': {'lat_min': 40.400, 'lat_max': 40.430, 'lon_min': -3.650, 'lon_max': -3.620},
            'Ciudad Lineal': {'lat_min': 40.430, 'lat_max': 40.460, 'lon_min': -3.650, 'lon_max': -3.620},
            'Hortaleza': {'lat_min': 40.460, 'lat_max': 40.500, 'lon_min': -3.650, 'lon_max': -3.620},
            'Villaverde': {'lat_min': 40.320, 'lat_max': 40.360, 'lon_min': -3.720, 'lon_max': -3.680},
        }

        # Find matching district
        for district_name, bounds in districts.items():
            if (bounds['lat_min'] <= lat <= bounds['lat_max'] and
                    bounds['lon_min'] <= lon <= bounds['lon_max']):
                return district_name

        # Default for coordinates in Madrid but not matching specific districts
        if 40.3 <= lat <= 40.6 and -3.8 <= lon <= -3.5:
            return 'Other Madrid'

        return None

    def classify_road_type(self, measurement_point_name: str) -> str:
        """Classify road type based on measurement point name"""
        if not measurement_point_name:
            return 'URB'  # Default to urban

        name_upper = measurement_point_name.upper()

        # Highway patterns
        if any(pattern in name_upper for pattern in ['A-', 'AP-', 'AUTOPISTA', 'AUTOVIA']):
            return 'A'

        # M-30 Ring road
        if 'M-30' in name_upper or 'M30' in name_upper:
            return 'M30'

        # Other M roads (ring roads)
        if re.search(r'M-[0-9]', name_upper) or re.search(r'M[0-9]', name_upper):
            return 'C30'

        # Radial roads (R-1, R-2, etc.)
        if re.search(r'R-[0-9]', name_upper) or 'RADIAL' in name_upper:
            return 'R'

        # Default to urban street
        return 'URB'

    def validate_madrid_coordinates(self, lat: float, lon: float) -> bool:
        """Validate coordinates are within Madrid bounds"""
        # Madrid metropolitan area bounds
        return 40.3 <= lat <= 40.6 and -3.8 <= lon <= -3.5

    def get_congestion_level(self, occupation_pct: float) -> str:
        """Convert occupation percentage to congestion level"""
        if occupation_pct >= 80:
            return CongestionLevel.BLOCKED.value
        elif occupation_pct >= 50:
            return CongestionLevel.HIGH.value
        elif occupation_pct >= 25:
            return CongestionLevel.MEDIUM.value
        else:
            return CongestionLevel.LOW.value

    def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
        """Calculate distance between two points in kilometers using the Haversine formula"""
        R = 6371  # Earth's radius in kilometers

        dlat = math.radians(lat2 - lat1)
        dlon = math.radians(lon2 - lon1)
        a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
             math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
             math.sin(dlon / 2) * math.sin(dlon / 2))
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

        return R * c

    def find_nearest_traffic_point(self, traffic_points: List[Dict[str, Any]],
                                   latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        """Find the nearest traffic point to given coordinates"""
        if not traffic_points:
            return None

        min_distance = float('inf')
        nearest_point = None

        for point in traffic_points:
            point_lat = point.get('latitude')
            point_lon = point.get('longitude')

            if point_lat and point_lon:
                distance = self.calculate_distance(latitude, longitude, point_lat, point_lon)
                if distance < min_distance:
                    min_distance = distance
                    nearest_point = point

        return nearest_point

    def find_nearest_measurement_points(self, measurement_points: Dict[str, Dict[str, Any]],
                                        latitude: float, longitude: float,
                                        num_points: int = 3,
                                        max_distance_km: Optional[float] = 5.0) -> List[Tuple[str, Dict[str, Any], float]]:
        """Find nearest measurement points for historical data"""
        distances = []

        for point_id, point_data in measurement_points.items():
            point_lat = point_data.get('latitude')
            point_lon = point_data.get('longitude')

            if point_lat and point_lon:
                distance_km = self.calculate_distance(latitude, longitude, point_lat, point_lon)
                distances.append((point_id, point_data, distance_km))

        # Sort by distance and take nearest points
        distances.sort(key=lambda x: x[2])

        # Apply distance filter if specified
        if max_distance_km is not None:
            distances = [p for p in distances if p[2] <= max_distance_km]

        nearest = distances[:num_points]

        self.logger.info("Found nearest measurement points",
                         count=len(nearest),
                         nearest_distance_km=nearest[0][2] if nearest else None)

        return nearest
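
A minimal sketch of the pedestrian inference call, using an illustrative record like the one shown after the models file; the import roots are assumptions and the factor values in the comment follow the constants above:

from datetime import datetime, timezone

from app.external.models import TrafficRecord  # hypothetical import root
from app.external.processors import MadridTrafficAnalyzer

analyzer = MadridTrafficAnalyzer()
record = TrafficRecord(
    date=datetime(2024, 3, 15, 13, 0, tzinfo=timezone.utc),  # a weekday at lunch peak
    traffic_volume=420, occupation_percentage=35, load_percentage=40,
    average_speed=28, congestion_level='medium', pedestrian_count=0,
    measurement_point_id='3840', measurement_point_name='Gran Via - Callao',
    road_type='URB', source='madrid_opendata_xml', district='Centro',
)

count, meta = analyzer.calculate_pedestrian_flow(record)
# URB base 250 x lunch peak 2.5 x Centro 2.5 x traffic factor 1.3 (load 40%)
# x March weather 0.85 x weekday 1.0 = int(1726.56) -> 1726, inside the 10-2000 cap.
print(count, meta['inferred_district'])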

services/data/app/external/processors/madrid_processor.py (new file, 478 lines)
@@ -0,0 +1,478 @@
# ================================================================
# services/data/app/external/processors/madrid_processor.py
# ================================================================
"""
Data transformation and parsing for Madrid traffic data
Handles XML parsing, CSV processing, coordinate conversion, and data quality scoring
"""

import csv
import io
import math
import re
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple
import structlog
import pyproj

from ..models.madrid_models import TrafficRecord, MeasurementPoint, CongestionLevel


class MadridTrafficDataProcessor:
    """Handles all data transformation and parsing for Madrid traffic data"""

    def __init__(self):
        self.logger = structlog.get_logger()
        # UTM Zone 30N (Madrid's coordinate system)
        self.utm_proj = pyproj.Proj(proj='utm', zone=30, ellps='WGS84', datum='WGS84')
        self.wgs84_proj = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')

    def safe_int(self, value: str) -> int:
        """Safely convert string to int"""
        try:
            return int(float(value.replace(',', '.')))
        except (ValueError, TypeError):
            return 0

    def _safe_float(self, value: str) -> float:
        """Safely convert string to float"""
        try:
            return float(value.replace(',', '.'))
        except (ValueError, TypeError):
            return 0.0

    def clean_madrid_xml(self, xml_content: str) -> str:
        """Clean and prepare Madrid XML content for parsing"""
        if not xml_content:
            return ""

        # Remove BOM and extra whitespace
        cleaned = xml_content.strip()
        if cleaned.startswith('\ufeff'):
            cleaned = cleaned[1:]

        # Fix common XML issues: escape bare ampersands that are not already entities
        cleaned = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;)', '&amp;', cleaned)

        # Ensure proper encoding declaration
        if not cleaned.startswith('<?xml'):
            cleaned = '<?xml version="1.0" encoding="UTF-8"?>\n' + cleaned

        return cleaned

    def convert_utm_to_latlon(self, utm_x: str, utm_y: str) -> Tuple[Optional[float], Optional[float]]:
        """Convert UTM coordinates to latitude/longitude"""
        try:
            utm_x_float = float(utm_x.replace(',', '.'))
            utm_y_float = float(utm_y.replace(',', '.'))

            # Convert from UTM Zone 30N to WGS84
            longitude, latitude = pyproj.transform(self.utm_proj, self.wgs84_proj, utm_x_float, utm_y_float)

            # Validate coordinates are in Madrid area
            if 40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5:
                return latitude, longitude
            else:
                self.logger.debug("Coordinates outside Madrid bounds",
                                  lat=latitude, lon=longitude, utm_x=utm_x, utm_y=utm_y)
                return None, None

        except Exception as e:
            self.logger.debug("UTM conversion error",
                              utm_x=utm_x, utm_y=utm_y, error=str(e))
            return None, None

    def parse_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]:
        """Parse Madrid traffic XML data"""
        traffic_points = []

        try:
            cleaned_xml = self.clean_madrid_xml(xml_content)
            root = ET.fromstring(cleaned_xml)

            self.logger.debug("Madrid XML structure", root_tag=root.tag, children_count=len(list(root)))

            if root.tag == 'pms':
                pm_elements = root.findall('pm')
                self.logger.debug("Found PM elements", count=len(pm_elements))

                for pm in pm_elements:
                    try:
                        traffic_point = self._extract_madrid_pm_element(pm)

                        if self._is_valid_traffic_point(traffic_point):
                            traffic_points.append(traffic_point)

                            # Log first few points for debugging
                            if len(traffic_points) <= 3:
                                self.logger.debug("Sample traffic point",
                                                  id=traffic_point['idelem'],
                                                  lat=traffic_point['latitude'],
                                                  lon=traffic_point['longitude'],
                                                  intensity=traffic_point.get('intensidad'))

                    except Exception as e:
                        self.logger.debug("Error parsing PM element", error=str(e))
                        continue
            else:
                self.logger.warning("Unexpected XML root tag", root_tag=root.tag)

            self.logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points))
            return traffic_points

        except ET.ParseError as e:
            self.logger.warning("Failed to parse Madrid XML", error=str(e))
            return self._extract_traffic_data_regex(xml_content)
        except Exception as e:
            self.logger.error("Error in Madrid traffic XML parsing", error=str(e))
            return []

    def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]:
        """Extract traffic data from Madrid <pm> element with coordinate conversion"""
        try:
            point_data = {}
            utm_x = utm_y = None

            # Extract all child elements
            for child in pm_element:
                tag, text = child.tag, child.text.strip() if child.text else ''

                if tag == 'idelem':
                    point_data['idelem'] = text
                elif tag == 'descripcion':
                    point_data['descripcion'] = text
                elif tag == 'intensidad':
                    point_data['intensidad'] = self.safe_int(text)
                elif tag == 'ocupacion':
                    point_data['ocupacion'] = self._safe_float(text)
                elif tag == 'carga':
                    point_data['carga'] = self.safe_int(text)
                elif tag == 'nivelServicio':
                    point_data['nivelServicio'] = self.safe_int(text)
                elif tag == 'st_x':  # UTM X coordinate
                    utm_x = text
                    point_data['utm_x'] = text
                elif tag == 'st_y':  # UTM Y coordinate
                    utm_y = text
                    point_data['utm_y'] = text
                elif tag == 'error':
                    point_data['error'] = text
                elif tag in ['subarea', 'accesoAsociado', 'intensidadSat']:
                    point_data[tag] = text

            # Convert coordinates
            if utm_x and utm_y:
                latitude, longitude = self.convert_utm_to_latlon(utm_x, utm_y)

                if latitude and longitude:
                    point_data.update({
                        'latitude': latitude,
                        'longitude': longitude,
                        'measurement_point_id': point_data.get('idelem'),
                        'measurement_point_name': point_data.get('descripcion'),
                        'timestamp': datetime.now(timezone.utc),
                        'source': 'madrid_opendata_xml'
                    })

                    return point_data
                else:
                    self.logger.debug("Invalid coordinates after conversion",
                                      idelem=point_data.get('idelem'), utm_x=utm_x, utm_y=utm_y)
                    return {}
            else:
                self.logger.debug("Missing UTM coordinates", idelem=point_data.get('idelem'))
                return {}

        except Exception as e:
            self.logger.debug("Error extracting PM element", error=str(e))
            return {}

    def _is_valid_traffic_point(self, traffic_point: Dict[str, Any]) -> bool:
        """Validate traffic point data"""
        required_fields = ['idelem', 'latitude', 'longitude']
        return all(field in traffic_point and traffic_point[field] for field in required_fields)

    def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]:
        """Fallback regex-based extraction if XML parsing fails"""
        traffic_points = []

        try:
            # Pattern to match PM elements
            pm_pattern = r'<pm>(.*?)</pm>'
            pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL)

            for pm_content in pm_matches:
                traffic_point = {}

                # Extract key fields
                patterns = {
                    'idelem': r'<idelem>(.*?)</idelem>',
                    'descripcion': r'<descripcion>(.*?)</descripcion>',
                    'intensidad': r'<intensidad>(.*?)</intensidad>',
                    'ocupacion': r'<ocupacion>(.*?)</ocupacion>',
                    'st_x': r'<st_x>(.*?)</st_x>',
                    'st_y': r'<st_y>(.*?)</st_y>'
                }

                for field, pattern in patterns.items():
                    match = re.search(pattern, pm_content)
                    if match:
                        traffic_point[field] = match.group(1).strip()

                # Convert coordinates
                if 'st_x' in traffic_point and 'st_y' in traffic_point:
                    latitude, longitude = self.convert_utm_to_latlon(
                        traffic_point['st_x'], traffic_point['st_y']
                    )

                    if latitude and longitude:
                        traffic_point.update({
                            'latitude': latitude,
                            'longitude': longitude,
                            'intensidad': self.safe_int(traffic_point.get('intensidad', '0')),
                            'ocupacion': self._safe_float(traffic_point.get('ocupacion', '0')),
                            'measurement_point_id': traffic_point.get('idelem'),
                            'measurement_point_name': traffic_point.get('descripcion'),
                            'timestamp': datetime.now(timezone.utc),
                            'source': 'madrid_opendata_xml_regex'
                        })

                        traffic_points.append(traffic_point)

            self.logger.debug("Regex extraction completed", points=len(traffic_points))
            return traffic_points

        except Exception as e:
            self.logger.error("Error in regex extraction", error=str(e))
            return []

    def parse_measurement_points_csv(self, csv_content: str) -> Dict[str, Dict[str, Any]]:
        """Parse measurement points CSV into lookup dictionary"""
        measurement_points = {}

        try:
            # Parse CSV with semicolon delimiter
            csv_reader = csv.DictReader(io.StringIO(csv_content), delimiter=';')

            processed_count = 0
            for row in csv_reader:
                try:
                    # Extract point ID and coordinates
                    point_id = row.get('id', '').strip()
                    if not point_id:
                        continue

                    processed_count += 1

                    # Try different coordinate field names
                    lat_str = ''
                    lon_str = ''

                    # Common coordinate field patterns
                    lat_fields = ['lat', 'latitude', 'latitud', 'y', 'utm_y']
                    lon_fields = ['lon', 'lng', 'longitude', 'longitud', 'x', 'utm_x']

                    for field in lat_fields:
                        if field in row and row[field].strip():
                            lat_str = row[field].strip()
                            break

                    for field in lon_fields:
                        if field in row and row[field].strip():
                            lon_str = row[field].strip()
                            break

                    if lat_str and lon_str:
                        try:
                            # Try direct lat/lon first
                            latitude = self._safe_float(lat_str)
                            longitude = self._safe_float(lon_str)

                            # If values look like UTM coordinates, convert them
                            if latitude > 1000 or longitude > 1000:
                                latitude, longitude = self.convert_utm_to_latlon(lon_str, lat_str)
                                if not latitude or not longitude:
                                    continue

                            # Validate Madrid area
                            if not (40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5):
                                continue

                            measurement_points[point_id] = {
                                'id': point_id,
                                'latitude': latitude,
                                'longitude': longitude,
                                'name': row.get('nombre', row.get('descripcion', f"Point {point_id}")),
                                'type': row.get('tipo', 'traffic'),
                                'raw_data': dict(row)  # Keep original data
                            }

                        except Exception as e:
                            self.logger.debug("Error processing point coordinates",
                                              point_id=point_id, error=str(e))
                            continue

                except Exception as e:
                    self.logger.debug("Error processing CSV row", error=str(e))
                    continue

            self.logger.info("Parsed measurement points registry",
                             total_points=len(measurement_points))
            return measurement_points

        except Exception as e:
            self.logger.error("Error parsing measurement points CSV", error=str(e))
            return {}

    def calculate_data_quality_score(self, row: Dict[str, str]) -> float:
        """Calculate data quality score for a traffic record"""
        try:
            score = 1.0

            # Check for missing or invalid values
            intensidad = row.get('intensidad', '').strip()
            if not intensidad or intensidad in ['N', '', '0']:
                score *= 0.7

            ocupacion = row.get('ocupacion', '').strip()
            if not ocupacion or ocupacion in ['N', '', '0']:
                score *= 0.8

            error_status = row.get('error', '').strip()
            if error_status and error_status != 'N':
                score *= 0.6

            # Check for reasonable value ranges
            try:
                intensidad_val = self.safe_int(intensidad)
                if intensidad_val < 0 or intensidad_val > 5000:  # Unrealistic traffic volume
                    score *= 0.7

                ocupacion_val = self.safe_int(ocupacion)
                if ocupacion_val < 0 or ocupacion_val > 100:  # Invalid percentage
                    score *= 0.5

            except Exception:
                score *= 0.6

            return max(0.1, score)  # Minimum quality score

        except Exception as e:
            self.logger.debug("Error calculating quality score", error=str(e))
            return 0.5  # Default medium quality

    async def process_csv_content_chunked(self, text_content: str, csv_filename: str,
                                          nearest_ids: set, nearest_points: list) -> list:
        """Process CSV content in chunks to prevent memory issues"""
        import csv
        import io
        import gc

        try:
            csv_reader = csv.DictReader(io.StringIO(text_content), delimiter=';')

            chunk_size = 10000
            chunk_records = []
            all_records = []
            processed_count = 0
            total_rows_seen = 0

            for row in csv_reader:
                total_rows_seen += 1
                measurement_point_id = row.get('id', '').strip()

                if measurement_point_id not in nearest_ids:
                    continue

                try:
                    record_data = await self.parse_historical_csv_row(row, nearest_points)

                    if record_data:
                        chunk_records.append(record_data)
                        processed_count += 1

                        if len(chunk_records) >= chunk_size:
                            all_records.extend(chunk_records)
                            chunk_records = []
                            gc.collect()

                except Exception as e:
                    if processed_count < 5:
                        self.logger.error("Row parsing exception",
                                          row_num=total_rows_seen,
                                          measurement_point_id=measurement_point_id,
                                          error=str(e))
                    continue

            # Process remaining records
            if chunk_records:
                all_records.extend(chunk_records)
                chunk_records = []
                gc.collect()

            self.logger.info("Processed CSV file",
                             filename=csv_filename,
                             total_rows_read=total_rows_seen,
                             processed_records=processed_count)

            return all_records

        except Exception as e:
            self.logger.error("Error processing CSV content",
                              filename=csv_filename, error=str(e))
            return []

    async def parse_historical_csv_row(self, row: dict, nearest_points: list) -> Optional[dict]:
        """Parse a single row from Madrid's historical traffic CSV"""
        try:
            # Extract date
            fecha_str = row.get('fecha', '').strip()
            if not fecha_str:
                return None

            try:
                from datetime import datetime, timezone
                date_obj = datetime.strptime(fecha_str, '%Y-%m-%d %H:%M:%S')
                date_obj = date_obj.replace(tzinfo=timezone.utc)
            except Exception:
                return None

            measurement_point_id = row.get('id', '').strip()

            # Find point data
            point_match = next((p for p in nearest_points if p[0] == measurement_point_id), None)
            if not point_match:
                return None

            point_data = point_match[1]
            distance_km = point_match[2]

            # Extract traffic data
            intensidad = self.safe_int(row.get('intensidad', '0'))
            ocupacion = self.safe_int(row.get('ocupacion', '0'))
            carga = self.safe_int(row.get('carga', '0'))
            vmed = self.safe_int(row.get('vmed', '0'))

            # Build basic result (business logic will be applied elsewhere)
            result = {
                'date': date_obj,
                'measurement_point_id': measurement_point_id,
                'point_data': point_data,
                'distance_km': distance_km,
                'traffic_data': {
                    'intensidad': intensidad,
                    'ocupacion': ocupacion,
                    'carga': carga,
                    'vmed': vmed
                },
                'data_quality_score': self.calculate_data_quality_score(row),
                'raw_row': row
            }

            return result

        except Exception as e:
            self.logger.debug("Error parsing historical CSV row", error=str(e))
            return None
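
A minimal sketch of the XML parsing path with one hand-written <pm> element; the UTM values are illustrative coordinates near central Madrid and the import root is an assumption:

from app.external.processors import MadridTrafficDataProcessor  # hypothetical import root

processor = MadridTrafficDataProcessor()
sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
<pms>
  <pm>
    <idelem>3840</idelem>
    <descripcion>Gran Via - Callao</descripcion>
    <intensidad>420</intensidad>
    <ocupacion>12</ocupacion>
    <carga>35</carga>
    <nivelServicio>1</nivelServicio>
    <st_x>440000</st_x>
    <st_y>4474500</st_y>
    <error>N</error>
  </pm>
</pms>"""

points = processor.parse_traffic_xml(sample_xml)
# Each returned dict carries latitude/longitude converted from UTM plus the raw Madrid fields.
for p in points:
    print(p['idelem'], round(p['latitude'], 5), round(p['longitude'], 5), p['intensidad'])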
@@ -639,10 +639,17 @@ class EnhancedForecastingService:
         if precipitation > 2.0:
             adjustment_factor *= 0.7
 
-        # Apply adjustments
+        # Apply adjustments to prediction
         adjusted_prediction = max(0, base_prediction * adjustment_factor)
-        adjusted_lower = max(0, lower_bound * adjustment_factor)
-        adjusted_upper = max(0, upper_bound * adjustment_factor)
+
+        # For confidence bounds, preserve relative interval width while respecting minimum bounds
+        original_interval = upper_bound - lower_bound
+        adjusted_interval = original_interval * adjustment_factor
+
+        # Ensure minimum reasonable lower bound (at least 20% of prediction or 5, whichever is larger)
+        min_lower_bound = max(adjusted_prediction * 0.2, 5.0)
+        adjusted_lower = max(min_lower_bound, adjusted_prediction - (adjusted_interval / 2))
+        adjusted_upper = max(adjusted_lower + 10, adjusted_prediction + (adjusted_interval / 2))
 
         return {
             "prediction": adjusted_prediction,
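
A worked instance of the new bounds logic with illustrative numbers, showing why the lower bound no longer collapses toward zero when the raw interval is wide:

base_prediction, lower_bound, upper_bound, adjustment_factor = 50.0, 5.0, 110.0, 0.7

adjusted_prediction = max(0, base_prediction * adjustment_factor)                        # 35.0
original_interval = upper_bound - lower_bound                                            # 105.0
adjusted_interval = original_interval * adjustment_factor                                # 73.5
min_lower_bound = max(adjusted_prediction * 0.2, 5.0)                                    # 7.0
adjusted_lower = max(min_lower_bound, adjusted_prediction - (adjusted_interval / 2))     # 7.0 (floor applies)
adjusted_upper = max(adjusted_lower + 10, adjusted_prediction + (adjusted_interval / 2)) # 71.75
# The previous code scaled each bound directly (5.0 * 0.7 = 3.5 and 110.0 * 0.7 = 77.0);
# the new logic re-centres the interval on the adjusted prediction and enforces the floor.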
@@ -162,7 +162,8 @@ class BakeryProphetManager:
                 'seasonality_mode': 'additive',
                 'daily_seasonality': False,
                 'weekly_seasonality': True,
-                'yearly_seasonality': False
+                'yearly_seasonality': False,
+                'uncertainty_samples': 100  # ✅ FIX: Minimal uncertainty sampling for very sparse data
             }
         elif zero_ratio > 0.6:
             logger.info(f"Moderate sparsity for {product_name}, using conservative optimization")
@@ -174,7 +175,8 @@
                 'seasonality_mode': 'additive',
                 'daily_seasonality': False,
                 'weekly_seasonality': True,
-                'yearly_seasonality': len(df) > 365  # Only if we have enough data
+                'yearly_seasonality': len(df) > 365,  # Only if we have enough data
+                'uncertainty_samples': 200  # ✅ FIX: Conservative uncertainty sampling for moderately sparse data
             }
 
         # Use unique seed for each product to avoid identical results
@@ -196,6 +198,16 @@
         changepoint_scale_range = (0.001, 0.5)
         seasonality_scale_range = (0.01, 10.0)
 
+        # ✅ FIX: Determine appropriate uncertainty samples range based on product category
+        if product_category == 'high_volume':
+            uncertainty_range = (300, 800)  # More samples for stable high-volume products
+        elif product_category == 'medium_volume':
+            uncertainty_range = (200, 500)  # Moderate samples for medium volume
+        elif product_category == 'low_volume':
+            uncertainty_range = (150, 300)  # Fewer samples for low volume
+        else:  # intermittent
+            uncertainty_range = (100, 200)  # Minimal samples for intermittent demand
+
         params = {
             'changepoint_prior_scale': trial.suggest_float(
                 'changepoint_prior_scale',
@@ -214,7 +226,8 @@
             'seasonality_mode': 'additive' if product_category == 'high_volume' else trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative']),
             'daily_seasonality': trial.suggest_categorical('daily_seasonality', [True, False]),
             'weekly_seasonality': True,  # Always keep weekly
-            'yearly_seasonality': trial.suggest_categorical('yearly_seasonality', [True, False])
+            'yearly_seasonality': trial.suggest_categorical('yearly_seasonality', [True, False]),
+            'uncertainty_samples': trial.suggest_int('uncertainty_samples', uncertainty_range[0], uncertainty_range[1])  # ✅ FIX: Adaptive uncertainty sampling
         }
 
         # Simple 2-fold cross-validation for speed
@@ -229,8 +242,10 @@
                 continue
 
             try:
-                # Create and train model
-                model = Prophet(**params, interval_width=0.8, uncertainty_samples=100)
+                # Create and train model with adaptive uncertainty sampling
+                uncertainty_samples = params.get('uncertainty_samples', 200)  # ✅ FIX: Use adaptive uncertainty samples
+                model = Prophet(**{k: v for k, v in params.items() if k != 'uncertainty_samples'},
+                                interval_width=0.8, uncertainty_samples=uncertainty_samples)
 
                 for regressor in regressor_columns:
                     if regressor in train_data.columns:
@@ -291,6 +306,12 @@
 
         logger.info(f"Optimization completed for {product_name}. Best score: {best_score:.2f}%. "
                     f"Parameters: {best_params}")
+
+        # ✅ FIX: Log uncertainty sampling configuration for debugging confidence intervals
+        uncertainty_samples = best_params.get('uncertainty_samples', 500)
+        logger.info(f"Prophet model will use {uncertainty_samples} uncertainty samples for {product_name} "
+                    f"(category: {product_category}, zero_ratio: {zero_ratio:.2f})")
+
         return best_params
 
     def _classify_product(self, product_name: str, sales_data: pd.DataFrame) -> str:
@@ -329,9 +350,12 @@
         return 'intermittent'
 
     def _create_optimized_prophet_model(self, optimized_params: Dict[str, Any], regressor_columns: List[str]) -> Prophet:
-        """Create Prophet model with optimized parameters"""
+        """Create Prophet model with optimized parameters and adaptive uncertainty sampling"""
         holidays = self._get_spanish_holidays()
 
+        # Determine uncertainty samples based on data characteristics
+        uncertainty_samples = optimized_params.get('uncertainty_samples', 500)
+
         model = Prophet(
             holidays=holidays if not holidays.empty else None,
             daily_seasonality=optimized_params.get('daily_seasonality', True),
@@ -344,7 +368,7 @@
             changepoint_range=optimized_params.get('changepoint_range', 0.8),
             interval_width=0.8,
             mcmc_samples=0,
-            uncertainty_samples=1000
+            uncertainty_samples=uncertainty_samples
         )
 
         return model
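
For context, a minimal sketch of what uncertainty_samples controls, assuming the prophet package and an illustrative data frame; it sets the number of simulated future paths used to derive yhat_lower/yhat_upper, so more samples give smoother intervals at higher cost:

import pandas as pd
from prophet import Prophet  # assumed package name

df = pd.DataFrame({
    "ds": pd.date_range("2024-01-01", periods=120, freq="D"),
    "y": [20 + (i % 7) * 3 for i in range(120)],  # synthetic weekly pattern
})

model = Prophet(interval_width=0.8, mcmc_samples=0, uncertainty_samples=200)
model.fit(df)
forecast = model.predict(model.make_future_dataframe(periods=14))
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())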