Refactor the traffic fetching system

Urtzi Alfaro
2025-08-10 18:32:47 +02:00
parent 3c2acc934a
commit 8d125ab0d5
10 changed files with 1356 additions and 1574 deletions


@@ -0,0 +1,14 @@
# ================================================================
# services/data/app/external/processors/__init__.py
# ================================================================
"""
Data processors package
"""
from .madrid_processor import MadridTrafficDataProcessor
from .madrid_business_logic import MadridTrafficAnalyzer
__all__ = [
'MadridTrafficDataProcessor',
'MadridTrafficAnalyzer'
]
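
A minimal usage sketch of the package surface; the absolute import path app.external.processors is an assumption from the service layout, and both constructors take no arguments, per the files below:

# Hypothetical usage sketch, not part of this commit
from app.external.processors import MadridTrafficDataProcessor, MadridTrafficAnalyzer

processor = MadridTrafficDataProcessor()  # parsing and transformation
analyzer = MadridTrafficAnalyzer()        # business rules and inference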


@@ -0,0 +1,346 @@
# ================================================================
# services/data/app/external/processors/madrid_business_logic.py
# ================================================================
"""
Business rules, inference, and domain logic for Madrid traffic data
Handles pedestrian inference, district mapping, road classification, and validation
"""
import math
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
import structlog
from ..models.madrid_models import TrafficRecord, CongestionLevel
class MadridTrafficAnalyzer:
"""Handles business logic for Madrid traffic analysis"""
# Madrid district characteristics for pedestrian patterns
DISTRICT_MULTIPLIERS = {
'Centro': 2.5, # Historic center, high pedestrian activity
'Salamanca': 2.0, # Shopping area, high foot traffic
'Chamberí': 1.8, # Business district
'Retiro': 2.2, # Near park, high leisure activity
'Chamartín': 1.6, # Business/residential
'Tetuán': 1.4, # Mixed residential/commercial
'Fuencarral': 1.3, # Residential with commercial areas
'Moncloa': 1.7, # University area
'Latina': 1.5, # Residential area
'Carabanchel': 1.2, # Residential periphery
'Usera': 1.1, # Industrial/residential
'Villaverde': 1.0, # Industrial area
'Villa de Vallecas': 1.0, # Peripheral residential
'Vicálvaro': 0.9, # Peripheral
'San Blas': 1.1, # Residential
'Barajas': 0.8, # Airport area, low pedestrian activity
'Hortaleza': 1.2, # Mixed area
'Ciudad Lineal': 1.3, # Linear development
'Puente de Vallecas': 1.2, # Working class area
'Moratalaz': 1.1, # Residential
'Arganzuela': 1.6, # Near center, growing area
}
# Time-based patterns (hour of day)
TIME_PATTERNS = {
'morning_peak': {'hours': [7, 8, 9], 'multiplier': 2.0},
'lunch_peak': {'hours': [12, 13, 14], 'multiplier': 2.5},
'evening_peak': {'hours': [18, 19, 20], 'multiplier': 2.2},
'afternoon': {'hours': [15, 16, 17], 'multiplier': 1.8},
'late_evening': {'hours': [21, 22], 'multiplier': 1.5},
'night': {'hours': [23, 0, 1, 2, 3, 4, 5, 6], 'multiplier': 0.3},
'morning': {'hours': [10, 11], 'multiplier': 1.4}
}
# Road type specific patterns
ROAD_TYPE_BASE = {
'URB': 250, # Urban streets - high pedestrian activity
'M30': 50, # Ring road - minimal pedestrians
'C30': 75, # Secondary ring - some pedestrian access
'A': 25, # Highways - very low pedestrians
'R': 40 # Radial roads - low to moderate
}
# Weather impact on pedestrian activity
WEATHER_IMPACT = {
'rain': 0.6, # 40% reduction in rain
'hot_weather': 0.8, # 20% reduction when very hot
'cold_weather': 0.7, # 30% reduction when very cold
'normal': 1.0 # No impact
}
def __init__(self):
self.logger = structlog.get_logger()
def calculate_pedestrian_flow(
self,
traffic_record: TrafficRecord,
location_context: Optional[Dict[str, Any]] = None
) -> Tuple[int, Dict[str, float]]:
"""
Calculate pedestrian flow estimate with detailed metadata
Returns:
Tuple of (pedestrian_count, inference_metadata)
"""
# Base calculation from road type
road_type = traffic_record.road_type or 'URB'
base_pedestrians = self.ROAD_TYPE_BASE.get(road_type, 200)
# Time pattern adjustment
hour = traffic_record.date.hour
time_factor = self._get_time_pattern_factor(hour)
# District adjustment (if available)
district_factor = 1.0
district = traffic_record.district or self.infer_district_from_location(location_context)
if district:
district_factor = self.DISTRICT_MULTIPLIERS.get(district, 1.0)
# Traffic correlation adjustment
traffic_factor = self._calculate_traffic_correlation(traffic_record)
# Weather adjustment (if data available)
weather_factor = self._get_weather_factor(traffic_record.date, location_context)
# Weekend adjustment
weekend_factor = self._get_weekend_factor(traffic_record.date)
# Combined calculation
pedestrian_count = int(
base_pedestrians *
time_factor *
district_factor *
traffic_factor *
weather_factor *
weekend_factor
)
# Ensure reasonable bounds
pedestrian_count = max(10, min(2000, pedestrian_count))
# Metadata for model training
inference_metadata = {
'base_pedestrians': base_pedestrians,
'time_factor': time_factor,
'district_factor': district_factor,
'traffic_factor': traffic_factor,
'weather_factor': weather_factor,
'weekend_factor': weekend_factor,
'inferred_district': district,
'hour': hour,
'road_type': road_type
}
return pedestrian_count, inference_metadata
def _get_time_pattern_factor(self, hour: int) -> float:
"""Get time-based pedestrian activity multiplier"""
for pattern, config in self.TIME_PATTERNS.items():
if hour in config['hours']:
return config['multiplier']
return 1.0 # Default multiplier
def _calculate_traffic_correlation(self, traffic_record: TrafficRecord) -> float:
"""
Calculate pedestrian correlation with traffic patterns
Higher traffic in urban areas often correlates with more pedestrians
"""
if traffic_record.road_type == 'URB':
# Urban areas: moderate traffic indicates commercial activity
if 30 <= traffic_record.load_percentage <= 70:
return 1.3 # Sweet spot for pedestrian activity
elif traffic_record.load_percentage > 70:
return 0.9 # Too congested, pedestrians avoid
else:
return 1.0 # Normal correlation
else:
# Highway/ring roads: more traffic = fewer pedestrians
if traffic_record.load_percentage > 60:
return 0.5
else:
return 0.8
def _get_weather_factor(self, date: datetime, location_context: Optional[Dict] = None) -> float:
"""Estimate weather impact on pedestrian activity"""
# Simplified weather inference based on season and typical Madrid patterns
month = date.month
# Madrid seasonal patterns
if month in [12, 1, 2]: # Winter - cold weather impact
return self.WEATHER_IMPACT['cold_weather']
elif month in [7, 8]: # Summer - hot weather impact
return self.WEATHER_IMPACT['hot_weather']
elif month in [10, 11, 3, 4]: # Rainy seasons - moderate impact
return 0.85
else: # Spring/early summer - optimal weather
return 1.1
def _get_weekend_factor(self, date: datetime) -> float:
"""Weekend vs weekday pedestrian patterns"""
weekday = date.weekday()
hour = date.hour
if weekday >= 5: # Weekend
if 11 <= hour <= 16: # Weekend shopping/leisure hours
return 1.4
elif 20 <= hour <= 23: # Weekend evening activity
return 1.3
else:
return 0.9
else: # Weekday
return 1.0
def infer_district_from_location(self, location_context: Optional[Dict] = None) -> Optional[str]:
"""
Infer Madrid district from location context or coordinates
"""
if not location_context:
return None
lat = location_context.get('latitude')
lon = location_context.get('longitude')
if lat is None or lon is None:
return None
# Madrid district boundaries (simplified boundaries for inference)
districts = {
# Central districts
'Centro': {'lat_min': 40.405, 'lat_max': 40.425, 'lon_min': -3.720, 'lon_max': -3.690},
'Arganzuela': {'lat_min': 40.385, 'lat_max': 40.410, 'lon_min': -3.720, 'lon_max': -3.680},
'Retiro': {'lat_min': 40.405, 'lat_max': 40.425, 'lon_min': -3.690, 'lon_max': -3.660},
'Salamanca': {'lat_min': 40.420, 'lat_max': 40.445, 'lon_min': -3.690, 'lon_max': -3.660},
'Chamartín': {'lat_min': 40.445, 'lat_max': 40.480, 'lon_min': -3.690, 'lon_max': -3.660},
'Tetuán': {'lat_min': 40.445, 'lat_max': 40.470, 'lon_min': -3.720, 'lon_max': -3.690},
'Chamberí': {'lat_min': 40.425, 'lat_max': 40.450, 'lon_min': -3.720, 'lon_max': -3.690},
'Fuencarral-El Pardo': {'lat_min': 40.470, 'lat_max': 40.540, 'lon_min': -3.750, 'lon_max': -3.650},
'Moncloa-Aravaca': {'lat_min': 40.430, 'lat_max': 40.480, 'lon_min': -3.750, 'lon_max': -3.720},
'Latina': {'lat_min': 40.380, 'lat_max': 40.420, 'lon_min': -3.750, 'lon_max': -3.720},
'Carabanchel': {'lat_min': 40.350, 'lat_max': 40.390, 'lon_min': -3.750, 'lon_max': -3.720},
'Usera': {'lat_min': 40.350, 'lat_max': 40.385, 'lon_min': -3.720, 'lon_max': -3.690},
'Puente de Vallecas': {'lat_min': 40.370, 'lat_max': 40.410, 'lon_min': -3.680, 'lon_max': -3.640},
'Moratalaz': {'lat_min': 40.400, 'lat_max': 40.430, 'lon_min': -3.650, 'lon_max': -3.620},
'Ciudad Lineal': {'lat_min': 40.430, 'lat_max': 40.460, 'lon_min': -3.650, 'lon_max': -3.620},
'Hortaleza': {'lat_min': 40.460, 'lat_max': 40.500, 'lon_min': -3.650, 'lon_max': -3.620},
'Villaverde': {'lat_min': 40.320, 'lat_max': 40.360, 'lon_min': -3.720, 'lon_max': -3.680},
}
# Find matching district
for district_name, bounds in districts.items():
if (bounds['lat_min'] <= lat <= bounds['lat_max'] and
bounds['lon_min'] <= lon <= bounds['lon_max']):
return district_name
# Default for coordinates in Madrid but not matching specific districts
if 40.3 <= lat <= 40.6 and -3.8 <= lon <= -3.5:
return 'Other Madrid'
return None
def classify_road_type(self, measurement_point_name: str) -> str:
"""Classify road type based on measurement point name"""
if not measurement_point_name:
return 'URB' # Default to urban
name_upper = measurement_point_name.upper()
# Highway patterns
if any(pattern in name_upper for pattern in ['A-', 'AP-', 'AUTOPISTA', 'AUTOVIA']):
return 'A'
# M-30 Ring road
if 'M-30' in name_upper or 'M30' in name_upper:
return 'M30'
# Other M roads (ring roads, e.g. M-40, M-45); word boundary avoids matching inside names like 'KM4'
if re.search(r'\bM-?[0-9]', name_upper):
return 'C30'
# Radial roads (R-1, R-2, etc.)
if re.search(r'R-[0-9]', name_upper) or 'RADIAL' in name_upper:
return 'R'
# Default to urban street
return 'URB'
def validate_madrid_coordinates(self, lat: float, lon: float) -> bool:
"""Validate coordinates are within Madrid bounds"""
# Madrid metropolitan area bounds
return 40.3 <= lat <= 40.6 and -3.8 <= lon <= -3.5
def get_congestion_level(self, occupation_pct: float) -> str:
"""Convert occupation percentage to congestion level"""
if occupation_pct >= 80:
return CongestionLevel.BLOCKED.value
elif occupation_pct >= 50:
return CongestionLevel.HIGH.value
elif occupation_pct >= 25:
return CongestionLevel.MEDIUM.value
else:
return CongestionLevel.LOW.value
def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""Calculate distance between two points in kilometers using Haversine formula"""
R = 6371 # Earth's radius in kilometers
dlat = math.radians(lat2 - lat1)
dlon = math.radians(lon2 - lon1)
a = (math.sin(dlat/2) * math.sin(dlat/2) +
math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
math.sin(dlon/2) * math.sin(dlon/2))
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
return R * c
def find_nearest_traffic_point(self, traffic_points: List[Dict[str, Any]],
latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
"""Find the nearest traffic point to given coordinates"""
if not traffic_points:
return None
min_distance = float('inf')
nearest_point = None
for point in traffic_points:
point_lat = point.get('latitude')
point_lon = point.get('longitude')
if point_lat and point_lon:
distance = self.calculate_distance(latitude, longitude, point_lat, point_lon)
if distance < min_distance:
min_distance = distance
nearest_point = point
return nearest_point
def find_nearest_measurement_points(self, measurement_points: Dict[str, Dict[str, Any]],
latitude: float, longitude: float,
num_points: int = 3, max_distance_km: Optional[float] = 5.0) -> List[Tuple[str, Dict[str, Any], float]]:
"""Find nearest measurement points for historical data"""
distances = []
for point_id, point_data in measurement_points.items():
point_lat = point_data.get('latitude')
point_lon = point_data.get('longitude')
if point_lat and point_lon:
distance_km = self.calculate_distance(latitude, longitude, point_lat, point_lon)
distances.append((point_id, point_data, distance_km))
# Sort by distance and take nearest points
distances.sort(key=lambda x: x[2])
# Apply distance filter if specified
if max_distance_km is not None:
distances = [p for p in distances if p[2] <= max_distance_km]
nearest = distances[:num_points]
self.logger.info("Found nearest measurement points",
count=len(nearest),
nearest_distance_km=nearest[0][2] if nearest else None)
return nearest
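
A hedged sketch of exercising the analyzer's geo helpers; the coordinates and the PM001/PM002 entries are illustrative, and the dict shape mirrors what parse_measurement_points_csv produces in the next file:

# Hypothetical usage sketch, not part of this commit
analyzer = MadridTrafficAnalyzer()
d_km = analyzer.calculate_distance(40.4168, -3.7038, 40.4669, -3.6892)  # ~5.7 km
points = {
    'PM001': {'latitude': 40.4170, 'longitude': -3.7040, 'name': 'Sol'},
    'PM002': {'latitude': 40.4300, 'longitude': -3.6900, 'name': 'Retiro N'},
}
nearest = analyzer.find_nearest_measurement_points(
    points, 40.4168, -3.7038, num_points=2, max_distance_km=5.0
)  # [(point_id, point_data, distance_km), ...] sorted by distance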


@@ -0,0 +1,478 @@
# ================================================================
# services/data/app/external/processors/madrid_processor.py
# ================================================================
"""
Data transformation and parsing for Madrid traffic data
Handles XML parsing, CSV processing, coordinate conversion, and data quality scoring
"""
import csv
import io
import math
import re
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple
import structlog
import pyproj
from ..models.madrid_models import TrafficRecord, MeasurementPoint, CongestionLevel
class MadridTrafficDataProcessor:
"""Handles all data transformation and parsing for Madrid traffic data"""
def __init__(self):
self.logger = structlog.get_logger()
# UTM Zone 30N (Madrid's coordinate system); Transformer replaces the
# legacy pyproj.transform API, which was removed in pyproj 3.x
self.utm_to_wgs84 = pyproj.Transformer.from_crs(
"EPSG:32630", "EPSG:4326", always_xy=True)
def safe_int(self, value: str) -> int:
"""Safely convert string to int (handles comma decimal separators and None)"""
try:
return int(float(value.replace(',', '.')))
except (ValueError, TypeError, AttributeError):
return 0
def _safe_float(self, value: str) -> float:
"""Safely convert string to float (handles comma decimal separators and None)"""
try:
return float(value.replace(',', '.'))
except (ValueError, TypeError, AttributeError):
return 0.0
def clean_madrid_xml(self, xml_content: str) -> str:
"""Clean and prepare Madrid XML content for parsing"""
if not xml_content:
return ""
# Remove BOM and extra whitespace
cleaned = xml_content.strip()
if cleaned.startswith('\ufeff'):
cleaned = cleaned[1:]
# Fix common XML issues
cleaned = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;)', '&amp;', cleaned)
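# e.g. '<descripcion>Cibeles & Alcala</descripcion>' becomes
# '...Cibeles &amp; Alcala...', while already-escaped entities are kept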
# Ensure proper encoding declaration
if not cleaned.startswith('<?xml'):
cleaned = '<?xml version="1.0" encoding="UTF-8"?>\n' + cleaned
return cleaned
def convert_utm_to_latlon(self, utm_x: str, utm_y: str) -> Tuple[Optional[float], Optional[float]]:
"""Convert UTM coordinates to latitude/longitude"""
try:
utm_x_float = float(utm_x.replace(',', '.'))
utm_y_float = float(utm_y.replace(',', '.'))
# Convert from UTM Zone 30N to WGS84 (always_xy: easting/northing in, lon/lat out)
longitude, latitude = self.utm_to_wgs84.transform(utm_x_float, utm_y_float)
# Validate coordinates are in Madrid area
if 40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5:
return latitude, longitude
else:
self.logger.debug("Coordinates outside Madrid bounds",
lat=latitude, lon=longitude, utm_x=utm_x, utm_y=utm_y)
return None, None
except Exception as e:
self.logger.debug("UTM conversion error",
utm_x=utm_x, utm_y=utm_y, error=str(e))
return None, None
def parse_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]:
"""Parse Madrid traffic XML data"""
traffic_points = []
try:
cleaned_xml = self.clean_madrid_xml(xml_content)
root = ET.fromstring(cleaned_xml)
self.logger.debug("Madrid XML structure", root_tag=root.tag, children_count=len(list(root)))
if root.tag == 'pms':
pm_elements = root.findall('pm')
self.logger.debug("Found PM elements", count=len(pm_elements))
for pm in pm_elements:
try:
traffic_point = self._extract_madrid_pm_element(pm)
if self._is_valid_traffic_point(traffic_point):
traffic_points.append(traffic_point)
# Log first few points for debugging
if len(traffic_points) <= 3:
self.logger.debug("Sample traffic point",
id=traffic_point['idelem'],
lat=traffic_point['latitude'],
lon=traffic_point['longitude'],
intensity=traffic_point.get('intensidad'))
except Exception as e:
self.logger.debug("Error parsing PM element", error=str(e))
continue
else:
self.logger.warning("Unexpected XML root tag", root_tag=root.tag)
self.logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points))
return traffic_points
except ET.ParseError as e:
self.logger.warning("Failed to parse Madrid XML", error=str(e))
return self._extract_traffic_data_regex(xml_content)
except Exception as e:
self.logger.error("Error in Madrid traffic XML parsing", error=str(e))
return []
def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]:
"""Extract traffic data from Madrid <pm> element with coordinate conversion"""
try:
point_data = {}
utm_x = utm_y = None
# Extract all child elements
for child in pm_element:
tag, text = child.tag, child.text.strip() if child.text else ''
if tag == 'idelem':
point_data['idelem'] = text
elif tag == 'descripcion':
point_data['descripcion'] = text
elif tag == 'intensidad':
point_data['intensidad'] = self.safe_int(text)
elif tag == 'ocupacion':
point_data['ocupacion'] = self._safe_float(text)
elif tag == 'carga':
point_data['carga'] = self.safe_int(text)
elif tag == 'nivelServicio':
point_data['nivelServicio'] = self.safe_int(text)
elif tag == 'st_x': # UTM X coordinate
utm_x = text
point_data['utm_x'] = text
elif tag == 'st_y': # UTM Y coordinate
utm_y = text
point_data['utm_y'] = text
elif tag == 'error':
point_data['error'] = text
elif tag in ['subarea', 'accesoAsociado', 'intensidadSat']:
point_data[tag] = text
# Convert coordinates
if utm_x and utm_y:
latitude, longitude = self.convert_utm_to_latlon(utm_x, utm_y)
if latitude is not None and longitude is not None:
point_data.update({
'latitude': latitude,
'longitude': longitude,
'measurement_point_id': point_data.get('idelem'),
'measurement_point_name': point_data.get('descripcion'),
'timestamp': datetime.now(timezone.utc),
'source': 'madrid_opendata_xml'
})
return point_data
else:
self.logger.debug("Invalid coordinates after conversion",
idelem=point_data.get('idelem'), utm_x=utm_x, utm_y=utm_y)
return {}
else:
self.logger.debug("Missing UTM coordinates", idelem=point_data.get('idelem'))
return {}
except Exception as e:
self.logger.debug("Error extracting PM element", error=str(e))
return {}
def _is_valid_traffic_point(self, traffic_point: Dict[str, Any]) -> bool:
"""Validate traffic point data"""
required_fields = ['idelem', 'latitude', 'longitude']
return all(field in traffic_point and traffic_point[field] for field in required_fields)
def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]:
"""Fallback regex-based extraction if XML parsing fails"""
traffic_points = []
try:
# Pattern to match PM elements
pm_pattern = r'<pm>(.*?)</pm>'
pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL)
for pm_content in pm_matches:
traffic_point = {}
# Extract key fields
patterns = {
'idelem': r'<idelem>(.*?)</idelem>',
'descripcion': r'<descripcion>(.*?)</descripcion>',
'intensidad': r'<intensidad>(.*?)</intensidad>',
'ocupacion': r'<ocupacion>(.*?)</ocupacion>',
'st_x': r'<st_x>(.*?)</st_x>',
'st_y': r'<st_y>(.*?)</st_y>'
}
for field, pattern in patterns.items():
match = re.search(pattern, pm_content)
if match:
traffic_point[field] = match.group(1).strip()
# Convert coordinates
if 'st_x' in traffic_point and 'st_y' in traffic_point:
latitude, longitude = self.convert_utm_to_latlon(
traffic_point['st_x'], traffic_point['st_y']
)
if latitude is not None and longitude is not None:
traffic_point.update({
'latitude': latitude,
'longitude': longitude,
'intensidad': self.safe_int(traffic_point.get('intensidad', '0')),
'ocupacion': self._safe_float(traffic_point.get('ocupacion', '0')),
'measurement_point_id': traffic_point.get('idelem'),
'measurement_point_name': traffic_point.get('descripcion'),
'timestamp': datetime.now(timezone.utc),
'source': 'madrid_opendata_xml_regex'
})
traffic_points.append(traffic_point)
self.logger.debug("Regex extraction completed", points=len(traffic_points))
return traffic_points
except Exception as e:
self.logger.error("Error in regex extraction", error=str(e))
return []
def parse_measurement_points_csv(self, csv_content: str) -> Dict[str, Dict[str, Any]]:
"""Parse measurement points CSV into lookup dictionary"""
measurement_points = {}
try:
# Parse CSV with semicolon delimiter
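# Assumed input shape (illustrative; real headers vary across datasets):
#   id;nombre;tipo;utm_x;utm_y
#   3840;Gran Via - Montera;URB;440000,5;4474500,2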
csv_reader = csv.DictReader(io.StringIO(csv_content), delimiter=';')
processed_count = 0
for row in csv_reader:
try:
# Extract point ID and coordinates
point_id = row.get('id', '').strip()
if not point_id:
continue
processed_count += 1
# Try different coordinate field names
lat_str = ''
lon_str = ''
# Common coordinate field patterns
lat_fields = ['lat', 'latitude', 'latitud', 'y', 'utm_y']
lon_fields = ['lon', 'lng', 'longitude', 'longitud', 'x', 'utm_x']
for field in lat_fields:
if field in row and row[field].strip():
lat_str = row[field].strip()
break
for field in lon_fields:
if field in row and row[field].strip():
lon_str = row[field].strip()
break
if lat_str and lon_str:
try:
# Try direct lat/lon first
latitude = self._safe_float(lat_str)
longitude = self._safe_float(lon_str)
# If values look like UTM coordinates, convert them
if latitude > 1000 or longitude > 1000:
latitude, longitude = self.convert_utm_to_latlon(lon_str, lat_str)
if latitude is None or longitude is None:
continue
# Validate Madrid area
if not (40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5):
continue
measurement_points[point_id] = {
'id': point_id,
'latitude': latitude,
'longitude': longitude,
'name': row.get('nombre', row.get('descripcion', f"Point {point_id}")),
'type': row.get('tipo', 'traffic'),
'raw_data': dict(row) # Keep original data
}
except Exception as e:
self.logger.debug("Error processing point coordinates",
point_id=point_id, error=str(e))
continue
except Exception as e:
self.logger.debug("Error processing CSV row", error=str(e))
continue
self.logger.info("Parsed measurement points registry",
total_points=len(measurement_points))
return measurement_points
except Exception as e:
self.logger.error("Error parsing measurement points CSV", error=str(e))
return {}
def calculate_data_quality_score(self, row: Dict[str, str]) -> float:
"""Calculate data quality score for a traffic record"""
try:
score = 1.0
# Check for missing or invalid values
intensidad = row.get('intensidad', '').strip()
if not intensidad or intensidad in ['N', '', '0']:
score *= 0.7
ocupacion = row.get('ocupacion', '').strip()
if not ocupacion or ocupacion in ['N', '', '0']:
score *= 0.8
error_status = row.get('error', '').strip()
if error_status and error_status != 'N':
score *= 0.6
# Check for reasonable value ranges
try:
intensidad_val = self.safe_int(intensidad)
if intensidad_val < 0 or intensidad_val > 5000: # Unrealistic traffic volume
score *= 0.7
ocupacion_val = self.safe_int(ocupacion)
if ocupacion_val < 0 or ocupacion_val > 100: # Invalid percentage
score *= 0.5
except (ValueError, TypeError):
score *= 0.6
return max(0.1, score) # Minimum quality score
except Exception as e:
self.logger.debug("Error calculating quality score", error=str(e))
return 0.5 # Default medium quality
async def process_csv_content_chunked(self, text_content: str, csv_filename: str,
nearest_ids: set, nearest_points: list) -> list:
"""Process CSV content in chunks to keep memory usage bounded"""
import gc  # csv and io are already imported at module level; gc is only needed here
try:
csv_reader = csv.DictReader(io.StringIO(text_content), delimiter=';')
chunk_size = 10000
chunk_records = []
all_records = []
processed_count = 0
total_rows_seen = 0
for row in csv_reader:
total_rows_seen += 1
measurement_point_id = row.get('id', '').strip()
if measurement_point_id not in nearest_ids:
continue
try:
record_data = await self.parse_historical_csv_row(row, nearest_points)
if record_data:
chunk_records.append(record_data)
processed_count += 1
if len(chunk_records) >= chunk_size:
all_records.extend(chunk_records)
chunk_records = []
gc.collect()
except Exception as e:
if processed_count < 5:
self.logger.error("Row parsing exception",
row_num=total_rows_seen,
measurement_point_id=measurement_point_id,
error=str(e))
continue
# Process remaining records
if chunk_records:
all_records.extend(chunk_records)
chunk_records = []
gc.collect()
self.logger.info("Processed CSV file",
filename=csv_filename,
total_rows_read=total_rows_seen,
processed_records=processed_count)
return all_records
except Exception as e:
self.logger.error("Error processing CSV content",
filename=csv_filename, error=str(e))
return []
async def parse_historical_csv_row(self, row: dict, nearest_points: list) -> Optional[dict]:
"""Parse a single row from Madrid's historical traffic CSV"""
try:
# Extract date
fecha_str = row.get('fecha', '').strip()
if not fecha_str:
return None
try:
# datetime and timezone are already imported at module level
date_obj = datetime.strptime(fecha_str, '%Y-%m-%d %H:%M:%S')
date_obj = date_obj.replace(tzinfo=timezone.utc)
except Exception:
return None
measurement_point_id = row.get('id', '').strip()
# Find point data
point_match = next((p for p in nearest_points if p[0] == measurement_point_id), None)
if not point_match:
return None
point_data = point_match[1]
distance_km = point_match[2]
# Extract traffic data
intensidad = self.safe_int(row.get('intensidad', '0'))
ocupacion = self.safe_int(row.get('ocupacion', '0'))
carga = self.safe_int(row.get('carga', '0'))
vmed = self.safe_int(row.get('vmed', '0'))
# Build basic result (business logic will be applied elsewhere)
result = {
'date': date_obj,
'measurement_point_id': measurement_point_id,
'point_data': point_data,
'distance_km': distance_km,
'traffic_data': {
'intensidad': intensidad,
'ocupacion': ocupacion,
'carga': carga,
'vmed': vmed
},
'data_quality_score': self.calculate_data_quality_score(row),
'raw_row': row
}
return result
except Exception as e:
self.logger.debug("Error parsing historical CSV row", error=str(e))
return None
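
Finally, a hedged end-to-end sketch of the processor path; the XML payload is a fabricated single-point example following the pm schema this parser expects, with UTM values inside the Madrid bounds so the point survives validation:

# Hypothetical usage sketch, not part of this commit
processor = MadridTrafficDataProcessor()
xml_payload = '''<?xml version="1.0" encoding="UTF-8"?>
<pms>
  <pm>
    <idelem>3840</idelem>
    <descripcion>Gran Via - Montera</descripcion>
    <intensidad>420</intensidad>
    <ocupacion>12</ocupacion>
    <carga>35</carga>
    <st_x>440000,5</st_x>
    <st_y>4474500,2</st_y>
  </pm>
</pms>'''
for p in processor.parse_traffic_xml(xml_payload):
    print(p['idelem'], round(p['latitude'], 4), round(p['longitude'], 4), p['intensidad'])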