Refactor the traffic fetching system

Urtzi Alfaro
2025-08-10 18:32:47 +02:00
parent 3c2acc934a
commit 8d125ab0d5
10 changed files with 1356 additions and 1574 deletions


@@ -0,0 +1,14 @@
# ================================================================
# services/data/app/external/processors/__init__.py
# ================================================================
"""
Data processors package
"""
from .madrid_processor import MadridTrafficDataProcessor
from .madrid_business_logic import MadridTrafficAnalyzer
__all__ = [
'MadridTrafficDataProcessor',
'MadridTrafficAnalyzer'
]
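
A minimal usage sketch of the package surface; the absolute import path app.external.processors is an assumption from the service layout, and both constructors take no arguments, per the files below:

# Hypothetical usage sketch, not part of this commit
from app.external.processors import MadridTrafficDataProcessor, MadridTrafficAnalyzer

processor = MadridTrafficDataProcessor()  # parsing and transformation
analyzer = MadridTrafficAnalyzer()        # business rules and inference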


@@ -0,0 +1,346 @@
# ================================================================
# services/data/app/external/processors/madrid_business_logic.py
# ================================================================
"""
Business rules, inference, and domain logic for Madrid traffic data
Handles pedestrian inference, district mapping, road classification, and validation
"""
import math
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
import structlog
from ..models.madrid_models import TrafficRecord, CongestionLevel
class MadridTrafficAnalyzer:
"""Handles business logic for Madrid traffic analysis"""
# Madrid district characteristics for pedestrian patterns
DISTRICT_MULTIPLIERS = {
'Centro': 2.5, # Historic center, high pedestrian activity
'Salamanca': 2.0, # Shopping area, high foot traffic
'Chamberí': 1.8, # Business district
'Retiro': 2.2, # Near park, high leisure activity
'Chamartín': 1.6, # Business/residential
'Tetuán': 1.4, # Mixed residential/commercial
'Fuencarral': 1.3, # Residential with commercial areas
'Moncloa': 1.7, # University area
'Latina': 1.5, # Residential area
'Carabanchel': 1.2, # Residential periphery
'Usera': 1.1, # Industrial/residential
'Villaverde': 1.0, # Industrial area
'Villa de Vallecas': 1.0, # Peripheral residential
'Vicálvaro': 0.9, # Peripheral
'San Blas': 1.1, # Residential
'Barajas': 0.8, # Airport area, low pedestrian activity
'Hortaleza': 1.2, # Mixed area
'Ciudad Lineal': 1.3, # Linear development
'Puente de Vallecas': 1.2, # Working class area
'Moratalaz': 1.1, # Residential
'Arganzuela': 1.6, # Near center, growing area
}
# Time-based patterns (hour of day)
TIME_PATTERNS = {
'morning_peak': {'hours': [7, 8, 9], 'multiplier': 2.0},
'lunch_peak': {'hours': [12, 13, 14], 'multiplier': 2.5},
'evening_peak': {'hours': [18, 19, 20], 'multiplier': 2.2},
'afternoon': {'hours': [15, 16, 17], 'multiplier': 1.8},
'late_evening': {'hours': [21, 22], 'multiplier': 1.5},
'night': {'hours': [23, 0, 1, 2, 3, 4, 5, 6], 'multiplier': 0.3},
'morning': {'hours': [10, 11], 'multiplier': 1.4}
}
# Road type specific patterns
ROAD_TYPE_BASE = {
'URB': 250, # Urban streets - high pedestrian activity
'M30': 50, # Ring road - minimal pedestrians
'C30': 75, # Secondary ring - some pedestrian access
'A': 25, # Highways - very low pedestrians
'R': 40 # Radial roads - low to moderate
}
# Weather impact on pedestrian activity
WEATHER_IMPACT = {
'rain': 0.6, # 40% reduction in rain
'hot_weather': 0.8, # 20% reduction when very hot
'cold_weather': 0.7, # 30% reduction when very cold
'normal': 1.0 # No impact
}
def __init__(self):
self.logger = structlog.get_logger()
def calculate_pedestrian_flow(
self,
traffic_record: TrafficRecord,
location_context: Optional[Dict[str, Any]] = None
) -> Tuple[int, Dict[str, float]]:
"""
Calculate pedestrian flow estimate with detailed metadata
Returns:
Tuple of (pedestrian_count, inference_metadata)
"""
# Base calculation from road type
road_type = traffic_record.road_type or 'URB'
base_pedestrians = self.ROAD_TYPE_BASE.get(road_type, 200)
# Time pattern adjustment
hour = traffic_record.date.hour
time_factor = self._get_time_pattern_factor(hour)
# District adjustment (if available)
district_factor = 1.0
district = traffic_record.district or self.infer_district_from_location(location_context)
if district:
district_factor = self.DISTRICT_MULTIPLIERS.get(district, 1.0)
# Traffic correlation adjustment
traffic_factor = self._calculate_traffic_correlation(traffic_record)
# Weather adjustment (if data available)
weather_factor = self._get_weather_factor(traffic_record.date, location_context)
# Weekend adjustment
weekend_factor = self._get_weekend_factor(traffic_record.date)
# Combined calculation
pedestrian_count = int(
base_pedestrians *
time_factor *
district_factor *
traffic_factor *
weather_factor *
weekend_factor
)
# Ensure reasonable bounds
pedestrian_count = max(10, min(2000, pedestrian_count))
# Metadata for model training
inference_metadata = {
'base_pedestrians': base_pedestrians,
'time_factor': time_factor,
'district_factor': district_factor,
'traffic_factor': traffic_factor,
'weather_factor': weather_factor,
'weekend_factor': weekend_factor,
'inferred_district': district,
'hour': hour,
'road_type': road_type
}
return pedestrian_count, inference_metadata
def _get_time_pattern_factor(self, hour: int) -> float:
"""Get time-based pedestrian activity multiplier"""
for pattern, config in self.TIME_PATTERNS.items():
if hour in config['hours']:
return config['multiplier']
return 1.0 # Default multiplier
def _calculate_traffic_correlation(self, traffic_record: TrafficRecord) -> float:
"""
Calculate pedestrian correlation with traffic patterns
Higher traffic in urban areas often correlates with more pedestrians
"""
if traffic_record.road_type == 'URB':
# Urban areas: moderate traffic indicates commercial activity
if 30 <= traffic_record.load_percentage <= 70:
return 1.3 # Sweet spot for pedestrian activity
elif traffic_record.load_percentage > 70:
return 0.9 # Too congested, pedestrians avoid
else:
return 1.0 # Normal correlation
else:
# Highway/ring roads: more traffic = fewer pedestrians
if traffic_record.load_percentage > 60:
return 0.5
else:
return 0.8
def _get_weather_factor(self, date: datetime, location_context: Optional[Dict] = None) -> float:
"""Estimate weather impact on pedestrian activity"""
# Simplified weather inference based on season and typical Madrid patterns
month = date.month
# Madrid seasonal patterns
if month in [12, 1, 2]: # Winter - cold weather impact
return self.WEATHER_IMPACT['cold_weather']
elif month in [7, 8]: # Summer - hot weather impact
return self.WEATHER_IMPACT['hot_weather']
elif month in [10, 11, 3, 4]: # Rainy seasons - moderate impact
return 0.85
else: # Spring/early summer - optimal weather
return 1.1
def _get_weekend_factor(self, date: datetime) -> float:
"""Weekend vs weekday pedestrian patterns"""
weekday = date.weekday()
hour = date.hour
if weekday >= 5: # Weekend
if 11 <= hour <= 16: # Weekend shopping/leisure hours
return 1.4
elif 20 <= hour <= 23: # Weekend evening activity
return 1.3
else:
return 0.9
else: # Weekday
return 1.0
def infer_district_from_location(self, location_context: Optional[Dict] = None) -> Optional[str]:
"""
Infer Madrid district from location context or coordinates
"""
if not location_context:
return None
lat = location_context.get('latitude')
lon = location_context.get('longitude')
if lat is None or lon is None:
return None
# Madrid district boundaries (simplified boundaries for inference)
districts = {
# Central districts
'Centro': {'lat_min': 40.405, 'lat_max': 40.425, 'lon_min': -3.720, 'lon_max': -3.690},
'Arganzuela': {'lat_min': 40.385, 'lat_max': 40.410, 'lon_min': -3.720, 'lon_max': -3.680},
'Retiro': {'lat_min': 40.405, 'lat_max': 40.425, 'lon_min': -3.690, 'lon_max': -3.660},
'Salamanca': {'lat_min': 40.420, 'lat_max': 40.445, 'lon_min': -3.690, 'lon_max': -3.660},
'Chamartín': {'lat_min': 40.445, 'lat_max': 40.480, 'lon_min': -3.690, 'lon_max': -3.660},
'Tetuán': {'lat_min': 40.445, 'lat_max': 40.470, 'lon_min': -3.720, 'lon_max': -3.690},
'Chamberí': {'lat_min': 40.425, 'lat_max': 40.450, 'lon_min': -3.720, 'lon_max': -3.690},
'Fuencarral-El Pardo': {'lat_min': 40.470, 'lat_max': 40.540, 'lon_min': -3.750, 'lon_max': -3.650},
'Moncloa-Aravaca': {'lat_min': 40.430, 'lat_max': 40.480, 'lon_min': -3.750, 'lon_max': -3.720},
'Latina': {'lat_min': 40.380, 'lat_max': 40.420, 'lon_min': -3.750, 'lon_max': -3.720},
'Carabanchel': {'lat_min': 40.350, 'lat_max': 40.390, 'lon_min': -3.750, 'lon_max': -3.720},
'Usera': {'lat_min': 40.350, 'lat_max': 40.385, 'lon_min': -3.720, 'lon_max': -3.690},
'Puente de Vallecas': {'lat_min': 40.370, 'lat_max': 40.410, 'lon_min': -3.680, 'lon_max': -3.640},
'Moratalaz': {'lat_min': 40.400, 'lat_max': 40.430, 'lon_min': -3.650, 'lon_max': -3.620},
'Ciudad Lineal': {'lat_min': 40.430, 'lat_max': 40.460, 'lon_min': -3.650, 'lon_max': -3.620},
'Hortaleza': {'lat_min': 40.460, 'lat_max': 40.500, 'lon_min': -3.650, 'lon_max': -3.620},
'Villaverde': {'lat_min': 40.320, 'lat_max': 40.360, 'lon_min': -3.720, 'lon_max': -3.680},
}
# Find matching district
for district_name, bounds in districts.items():
if (bounds['lat_min'] <= lat <= bounds['lat_max'] and
bounds['lon_min'] <= lon <= bounds['lon_max']):
return district_name
# Default for coordinates in Madrid but not matching specific districts
if 40.3 <= lat <= 40.6 and -3.8 <= lon <= -3.5:
return 'Other Madrid'
return None
def classify_road_type(self, measurement_point_name: str) -> str:
"""Classify road type based on measurement point name"""
if not measurement_point_name:
return 'URB' # Default to urban
name_upper = measurement_point_name.upper()
# Highway patterns
if any(pattern in name_upper for pattern in ['A-', 'AP-', 'AUTOPISTA', 'AUTOVIA']):
return 'A'
# M-30 Ring road
if 'M-30' in name_upper or 'M30' in name_upper:
return 'M30'
# Other M roads (ring roads, e.g. M-40, M-45); word boundary avoids matching inside names like 'KM4'
if re.search(r'\bM-?[0-9]', name_upper):
return 'C30'
# Radial roads (R-1, R-2, etc.)
if re.search(r'R-[0-9]', name_upper) or 'RADIAL' in name_upper:
return 'R'
# Default to urban street
return 'URB'
def validate_madrid_coordinates(self, lat: float, lon: float) -> bool:
"""Validate coordinates are within Madrid bounds"""
# Madrid metropolitan area bounds
return 40.3 <= lat <= 40.6 and -3.8 <= lon <= -3.5
def get_congestion_level(self, occupation_pct: float) -> str:
"""Convert occupation percentage to congestion level"""
if occupation_pct >= 80:
return CongestionLevel.BLOCKED.value
elif occupation_pct >= 50:
return CongestionLevel.HIGH.value
elif occupation_pct >= 25:
return CongestionLevel.MEDIUM.value
else:
return CongestionLevel.LOW.value
def calculate_distance(self, lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""Calculate distance between two points in kilometers using Haversine formula"""
R = 6371 # Earth's radius in kilometers
dlat = math.radians(lat2 - lat1)
dlon = math.radians(lon2 - lon1)
a = (math.sin(dlat/2) * math.sin(dlat/2) +
math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
math.sin(dlon/2) * math.sin(dlon/2))
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
return R * c
def find_nearest_traffic_point(self, traffic_points: List[Dict[str, Any]],
latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
"""Find the nearest traffic point to given coordinates"""
if not traffic_points:
return None
min_distance = float('inf')
nearest_point = None
for point in traffic_points:
point_lat = point.get('latitude')
point_lon = point.get('longitude')
if point_lat and point_lon:
distance = self.calculate_distance(latitude, longitude, point_lat, point_lon)
if distance < min_distance:
min_distance = distance
nearest_point = point
return nearest_point
def find_nearest_measurement_points(self, measurement_points: Dict[str, Dict[str, Any]],
latitude: float, longitude: float,
num_points: int = 3, max_distance_km: Optional[float] = 5.0) -> List[Tuple[str, Dict[str, Any], float]]:
"""Find nearest measurement points for historical data"""
distances = []
for point_id, point_data in measurement_points.items():
point_lat = point_data.get('latitude')
point_lon = point_data.get('longitude')
if point_lat and point_lon:
distance_km = self.calculate_distance(latitude, longitude, point_lat, point_lon)
distances.append((point_id, point_data, distance_km))
# Sort by distance and take nearest points
distances.sort(key=lambda x: x[2])
# Apply distance filter if specified
if max_distance_km is not None:
distances = [p for p in distances if p[2] <= max_distance_km]
nearest = distances[:num_points]
self.logger.info("Found nearest measurement points",
count=len(nearest),
nearest_distance_km=nearest[0][2] if nearest else None)
return nearest
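
A hedged sketch of exercising the analyzer's geo helpers; the coordinates and the PM001/PM002 entries are illustrative, and the dict shape mirrors what parse_measurement_points_csv produces in the next file:

# Hypothetical usage sketch, not part of this commit
analyzer = MadridTrafficAnalyzer()
d_km = analyzer.calculate_distance(40.4168, -3.7038, 40.4669, -3.6892)  # ~5.7 km
points = {
    'PM001': {'latitude': 40.4170, 'longitude': -3.7040, 'name': 'Sol'},
    'PM002': {'latitude': 40.4300, 'longitude': -3.6900, 'name': 'Retiro N'},
}
nearest = analyzer.find_nearest_measurement_points(
    points, 40.4168, -3.7038, num_points=2, max_distance_km=5.0
)  # [(point_id, point_data, distance_km), ...] sorted by distance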


@@ -0,0 +1,478 @@
# ================================================================
# services/data/app/external/processors/madrid_processor.py
# ================================================================
"""
Data transformation and parsing for Madrid traffic data
Handles XML parsing, CSV processing, coordinate conversion, and data quality scoring
"""
import csv
import io
import math
import re
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple
import structlog
import pyproj
from ..models.madrid_models import TrafficRecord, MeasurementPoint, CongestionLevel
class MadridTrafficDataProcessor:
"""Handles all data transformation and parsing for Madrid traffic data"""
def __init__(self):
self.logger = structlog.get_logger()
# UTM Zone 30N (Madrid's coordinate system); Transformer replaces the
# legacy pyproj.transform API, which was removed in pyproj 3.x
self.utm_to_wgs84 = pyproj.Transformer.from_crs(
"EPSG:32630", "EPSG:4326", always_xy=True)
def safe_int(self, value: str) -> int:
"""Safely convert string to int (handles comma decimal separators and None)"""
try:
return int(float(value.replace(',', '.')))
except (ValueError, TypeError, AttributeError):
return 0
def _safe_float(self, value: str) -> float:
"""Safely convert string to float (handles comma decimal separators and None)"""
try:
return float(value.replace(',', '.'))
except (ValueError, TypeError, AttributeError):
return 0.0
def clean_madrid_xml(self, xml_content: str) -> str:
"""Clean and prepare Madrid XML content for parsing"""
if not xml_content:
return ""
# Remove BOM and extra whitespace
cleaned = xml_content.strip()
if cleaned.startswith('\ufeff'):
cleaned = cleaned[1:]
# Fix common XML issues
cleaned = re.sub(r'&(?!amp;|lt;|gt;|quot;|apos;)', '&amp;', cleaned)
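# e.g. '<descripcion>Cibeles & Alcala</descripcion>' becomes
# '...Cibeles &amp; Alcala...', while already-escaped entities are kept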
# Ensure proper encoding declaration
if not cleaned.startswith('<?xml'):
cleaned = '<?xml version="1.0" encoding="UTF-8"?>\n' + cleaned
return cleaned
def convert_utm_to_latlon(self, utm_x: str, utm_y: str) -> Tuple[Optional[float], Optional[float]]:
"""Convert UTM coordinates to latitude/longitude"""
try:
utm_x_float = float(utm_x.replace(',', '.'))
utm_y_float = float(utm_y.replace(',', '.'))
# Convert from UTM Zone 30N to WGS84 (always_xy: easting/northing in, lon/lat out)
longitude, latitude = self.utm_to_wgs84.transform(utm_x_float, utm_y_float)
# Validate coordinates are in Madrid area
if 40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5:
return latitude, longitude
else:
self.logger.debug("Coordinates outside Madrid bounds",
lat=latitude, lon=longitude, utm_x=utm_x, utm_y=utm_y)
return None, None
except Exception as e:
self.logger.debug("UTM conversion error",
utm_x=utm_x, utm_y=utm_y, error=str(e))
return None, None
def parse_traffic_xml(self, xml_content: str) -> List[Dict[str, Any]]:
"""Parse Madrid traffic XML data"""
traffic_points = []
try:
cleaned_xml = self.clean_madrid_xml(xml_content)
root = ET.fromstring(cleaned_xml)
self.logger.debug("Madrid XML structure", root_tag=root.tag, children_count=len(list(root)))
if root.tag == 'pms':
pm_elements = root.findall('pm')
self.logger.debug("Found PM elements", count=len(pm_elements))
for pm in pm_elements:
try:
traffic_point = self._extract_madrid_pm_element(pm)
if self._is_valid_traffic_point(traffic_point):
traffic_points.append(traffic_point)
# Log first few points for debugging
if len(traffic_points) <= 3:
self.logger.debug("Sample traffic point",
id=traffic_point['idelem'],
lat=traffic_point['latitude'],
lon=traffic_point['longitude'],
intensity=traffic_point.get('intensidad'))
except Exception as e:
self.logger.debug("Error parsing PM element", error=str(e))
continue
else:
self.logger.warning("Unexpected XML root tag", root_tag=root.tag)
self.logger.debug("Madrid traffic XML parsing completed", valid_points=len(traffic_points))
return traffic_points
except ET.ParseError as e:
self.logger.warning("Failed to parse Madrid XML", error=str(e))
return self._extract_traffic_data_regex(xml_content)
except Exception as e:
self.logger.error("Error in Madrid traffic XML parsing", error=str(e))
return []
def _extract_madrid_pm_element(self, pm_element) -> Dict[str, Any]:
"""Extract traffic data from Madrid <pm> element with coordinate conversion"""
try:
point_data = {}
utm_x = utm_y = None
# Extract all child elements
for child in pm_element:
tag, text = child.tag, child.text.strip() if child.text else ''
if tag == 'idelem':
point_data['idelem'] = text
elif tag == 'descripcion':
point_data['descripcion'] = text
elif tag == 'intensidad':
point_data['intensidad'] = self.safe_int(text)
elif tag == 'ocupacion':
point_data['ocupacion'] = self._safe_float(text)
elif tag == 'carga':
point_data['carga'] = self.safe_int(text)
elif tag == 'nivelServicio':
point_data['nivelServicio'] = self.safe_int(text)
elif tag == 'st_x': # UTM X coordinate
utm_x = text
point_data['utm_x'] = text
elif tag == 'st_y': # UTM Y coordinate
utm_y = text
point_data['utm_y'] = text
elif tag == 'error':
point_data['error'] = text
elif tag in ['subarea', 'accesoAsociado', 'intensidadSat']:
point_data[tag] = text
# Convert coordinates
if utm_x and utm_y:
latitude, longitude = self.convert_utm_to_latlon(utm_x, utm_y)
if latitude is not None and longitude is not None:
point_data.update({
'latitude': latitude,
'longitude': longitude,
'measurement_point_id': point_data.get('idelem'),
'measurement_point_name': point_data.get('descripcion'),
'timestamp': datetime.now(timezone.utc),
'source': 'madrid_opendata_xml'
})
return point_data
else:
self.logger.debug("Invalid coordinates after conversion",
idelem=point_data.get('idelem'), utm_x=utm_x, utm_y=utm_y)
return {}
else:
self.logger.debug("Missing UTM coordinates", idelem=point_data.get('idelem'))
return {}
except Exception as e:
self.logger.debug("Error extracting PM element", error=str(e))
return {}
def _is_valid_traffic_point(self, traffic_point: Dict[str, Any]) -> bool:
"""Validate traffic point data"""
required_fields = ['idelem', 'latitude', 'longitude']
return all(field in traffic_point and traffic_point[field] for field in required_fields)
def _extract_traffic_data_regex(self, xml_content: str) -> List[Dict[str, Any]]:
"""Fallback regex-based extraction if XML parsing fails"""
traffic_points = []
try:
# Pattern to match PM elements
pm_pattern = r'<pm>(.*?)</pm>'
pm_matches = re.findall(pm_pattern, xml_content, re.DOTALL)
for pm_content in pm_matches:
traffic_point = {}
# Extract key fields
patterns = {
'idelem': r'<idelem>(.*?)</idelem>',
'descripcion': r'<descripcion>(.*?)</descripcion>',
'intensidad': r'<intensidad>(.*?)</intensidad>',
'ocupacion': r'<ocupacion>(.*?)</ocupacion>',
'st_x': r'<st_x>(.*?)</st_x>',
'st_y': r'<st_y>(.*?)</st_y>'
}
for field, pattern in patterns.items():
match = re.search(pattern, pm_content)
if match:
traffic_point[field] = match.group(1).strip()
# Convert coordinates
if 'st_x' in traffic_point and 'st_y' in traffic_point:
latitude, longitude = self.convert_utm_to_latlon(
traffic_point['st_x'], traffic_point['st_y']
)
if latitude is not None and longitude is not None:
traffic_point.update({
'latitude': latitude,
'longitude': longitude,
'intensidad': self.safe_int(traffic_point.get('intensidad', '0')),
'ocupacion': self._safe_float(traffic_point.get('ocupacion', '0')),
'measurement_point_id': traffic_point.get('idelem'),
'measurement_point_name': traffic_point.get('descripcion'),
'timestamp': datetime.now(timezone.utc),
'source': 'madrid_opendata_xml_regex'
})
traffic_points.append(traffic_point)
self.logger.debug("Regex extraction completed", points=len(traffic_points))
return traffic_points
except Exception as e:
self.logger.error("Error in regex extraction", error=str(e))
return []
def parse_measurement_points_csv(self, csv_content: str) -> Dict[str, Dict[str, Any]]:
"""Parse measurement points CSV into lookup dictionary"""
measurement_points = {}
try:
# Parse CSV with semicolon delimiter
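# Assumed input shape (illustrative; real headers vary across datasets):
#   id;nombre;tipo;utm_x;utm_y
#   3840;Gran Via - Montera;URB;440000,5;4474500,2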
csv_reader = csv.DictReader(io.StringIO(csv_content), delimiter=';')
processed_count = 0
for row in csv_reader:
try:
# Extract point ID and coordinates
point_id = row.get('id', '').strip()
if not point_id:
continue
processed_count += 1
# Try different coordinate field names
lat_str = ''
lon_str = ''
# Common coordinate field patterns
lat_fields = ['lat', 'latitude', 'latitud', 'y', 'utm_y']
lon_fields = ['lon', 'lng', 'longitude', 'longitud', 'x', 'utm_x']
for field in lat_fields:
if field in row and row[field].strip():
lat_str = row[field].strip()
break
for field in lon_fields:
if field in row and row[field].strip():
lon_str = row[field].strip()
break
if lat_str and lon_str:
try:
# Try direct lat/lon first
latitude = self._safe_float(lat_str)
longitude = self._safe_float(lon_str)
# If values look like UTM coordinates, convert them
if latitude > 1000 or longitude > 1000:
latitude, longitude = self.convert_utm_to_latlon(lon_str, lat_str)
if latitude is None or longitude is None:
continue
# Validate Madrid area
if not (40.3 <= latitude <= 40.6 and -3.8 <= longitude <= -3.5):
continue
measurement_points[point_id] = {
'id': point_id,
'latitude': latitude,
'longitude': longitude,
'name': row.get('nombre', row.get('descripcion', f"Point {point_id}")),
'type': row.get('tipo', 'traffic'),
'raw_data': dict(row) # Keep original data
}
except Exception as e:
self.logger.debug("Error processing point coordinates",
point_id=point_id, error=str(e))
continue
except Exception as e:
self.logger.debug("Error processing CSV row", error=str(e))
continue
self.logger.info("Parsed measurement points registry",
total_points=len(measurement_points))
return measurement_points
except Exception as e:
self.logger.error("Error parsing measurement points CSV", error=str(e))
return {}
def calculate_data_quality_score(self, row: Dict[str, str]) -> float:
"""Calculate data quality score for a traffic record"""
try:
score = 1.0
# Check for missing or invalid values
intensidad = row.get('intensidad', '').strip()
if not intensidad or intensidad in ['N', '', '0']:
score *= 0.7
ocupacion = row.get('ocupacion', '').strip()
if not ocupacion or ocupacion in ['N', '', '0']:
score *= 0.8
error_status = row.get('error', '').strip()
if error_status and error_status != 'N':
score *= 0.6
# Check for reasonable value ranges
try:
intensidad_val = self.safe_int(intensidad)
if intensidad_val < 0 or intensidad_val > 5000: # Unrealistic traffic volume
score *= 0.7
ocupacion_val = self.safe_int(ocupacion)
if ocupacion_val < 0 or ocupacion_val > 100: # Invalid percentage
score *= 0.5
except (ValueError, TypeError):
score *= 0.6
return max(0.1, score) # Minimum quality score
except Exception as e:
self.logger.debug("Error calculating quality score", error=str(e))
return 0.5 # Default medium quality
async def process_csv_content_chunked(self, text_content: str, csv_filename: str,
nearest_ids: set, nearest_points: list) -> list:
"""Process CSV content in chunks to keep memory usage bounded"""
import gc  # csv and io are already imported at module level; gc is only needed here
try:
csv_reader = csv.DictReader(io.StringIO(text_content), delimiter=';')
chunk_size = 10000
chunk_records = []
all_records = []
processed_count = 0
total_rows_seen = 0
for row in csv_reader:
total_rows_seen += 1
measurement_point_id = row.get('id', '').strip()
if measurement_point_id not in nearest_ids:
continue
try:
record_data = await self.parse_historical_csv_row(row, nearest_points)
if record_data:
chunk_records.append(record_data)
processed_count += 1
if len(chunk_records) >= chunk_size:
all_records.extend(chunk_records)
chunk_records = []
gc.collect()
except Exception as e:
if processed_count < 5:
self.logger.error("Row parsing exception",
row_num=total_rows_seen,
measurement_point_id=measurement_point_id,
error=str(e))
continue
# Process remaining records
if chunk_records:
all_records.extend(chunk_records)
chunk_records = []
gc.collect()
self.logger.info("Processed CSV file",
filename=csv_filename,
total_rows_read=total_rows_seen,
processed_records=processed_count)
return all_records
except Exception as e:
self.logger.error("Error processing CSV content",
filename=csv_filename, error=str(e))
return []
async def parse_historical_csv_row(self, row: dict, nearest_points: list) -> Optional[dict]:
"""Parse a single row from Madrid's historical traffic CSV"""
try:
# Extract date
fecha_str = row.get('fecha', '').strip()
if not fecha_str:
return None
try:
# datetime and timezone are already imported at module level
date_obj = datetime.strptime(fecha_str, '%Y-%m-%d %H:%M:%S')
date_obj = date_obj.replace(tzinfo=timezone.utc)
except Exception:
return None
measurement_point_id = row.get('id', '').strip()
# Find point data
point_match = next((p for p in nearest_points if p[0] == measurement_point_id), None)
if not point_match:
return None
point_data = point_match[1]
distance_km = point_match[2]
# Extract traffic data
intensidad = self.safe_int(row.get('intensidad', '0'))
ocupacion = self.safe_int(row.get('ocupacion', '0'))
carga = self.safe_int(row.get('carga', '0'))
vmed = self.safe_int(row.get('vmed', '0'))
# Build basic result (business logic will be applied elsewhere)
result = {
'date': date_obj,
'measurement_point_id': measurement_point_id,
'point_data': point_data,
'distance_km': distance_km,
'traffic_data': {
'intensidad': intensidad,
'ocupacion': ocupacion,
'carga': carga,
'vmed': vmed
},
'data_quality_score': self.calculate_data_quality_score(row),
'raw_row': row
}
return result
except Exception as e:
self.logger.debug("Error parsing historical CSV row", error=str(e))
return None
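
Finally, a hedged end-to-end sketch of the processor path; the XML payload is a fabricated single-point example following the pm schema this parser expects, with UTM values inside the Madrid bounds so the point survives validation:

# Hypothetical usage sketch, not part of this commit
processor = MadridTrafficDataProcessor()
xml_payload = '''<?xml version="1.0" encoding="UTF-8"?>
<pms>
  <pm>
    <idelem>3840</idelem>
    <descripcion>Gran Via - Montera</descripcion>
    <intensidad>420</intensidad>
    <ocupacion>12</ocupacion>
    <carga>35</carga>
    <st_x>440000,5</st_x>
    <st_y>4474500,2</st_y>
  </pm>
</pms>'''
for p in processor.parse_traffic_xml(xml_payload):
    print(p['idelem'], round(p['latitude'], 4), round(p['longitude'], 4), p['intensidad'])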