Improve the traffic fetching system

This commit is contained in:
Urtzi Alfaro
2025-08-10 17:31:38 +02:00
parent 312fdc8ef3
commit 3c2acc934a
16 changed files with 3866 additions and 1981 deletions

View File

@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
import asyncio
import logging
import structlog
from concurrent.futures import ThreadPoolExecutor
from datetime import timezone
import pandas as pd
@@ -24,7 +24,7 @@ from app.services.messaging import (
publish_job_failed
)
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
@dataclass
class TrainingDataSet:
@@ -39,15 +39,14 @@ class TrainingDataOrchestrator:
"""
Enhanced orchestrator for data collection from multiple sources.
Ensures date alignment, handles data source constraints, and prepares data for ML training.
Uses the new abstracted traffic service layer for multi-city support.
"""
def __init__(self,
             madrid_client=None,
             weather_client=None,
             date_alignment_service: DateAlignmentService = None):
    """Initialize the orchestrator with its data and date-alignment services.

    Args:
        madrid_client: Legacy parameter kept for backward compatibility;
            no longer used now that DataClient abstracts city clients.
        weather_client: Legacy parameter kept for backward compatibility.
        date_alignment_service: Service used to align and validate date
            ranges; a default instance is created when not provided.
    """
    self.data_client = DataClient()
    self.date_alignment_service = date_alignment_service or DateAlignmentService()
    # Increased from 3 to 5 for better performance (the diff render had left
    # both the old and new assignment in place; only the new value is kept).
    self.max_concurrent_requests = 5
async def prepare_training_data(
self,
@@ -281,11 +280,11 @@ class TrainingDataOrchestrator:
)
tasks.append(("weather", weather_task))
# Traffic data collection
# Enhanced Traffic data collection (supports multiple cities)
if DataSourceType.MADRID_TRAFFIC in aligned_range.available_sources:
logger.info(f"🚛 Traffic data source available, creating collection task for date range: {aligned_range.start} to {aligned_range.end}")
logger.info(f"🚛 Traffic data source available for multiple cities, creating collection task for date range: {aligned_range.start} to {aligned_range.end}")
traffic_task = asyncio.create_task(
self._collect_traffic_data_with_timeout(lat, lon, aligned_range, tenant_id)
self._collect_traffic_data_with_timeout_enhanced(lat, lon, aligned_range, tenant_id)
)
tasks.append(("traffic", traffic_task))
else:
@@ -353,28 +352,31 @@ class TrainingDataOrchestrator:
logger.warning(f"Weather data collection failed: {e}, using synthetic data")
return self._generate_synthetic_weather_data(aligned_range)
async def _collect_traffic_data_with_timeout_enhanced(
    self,
    lat: float,
    lon: float,
    aligned_range: AlignedDateRange,
    tenant_id: str
) -> List[Dict[str, Any]]:
    """
    Enhanced traffic data collection with multi-city support and improved storage.

    Uses the abstracted traffic service layer (DataClient), which detects the
    appropriate city for the coordinates, uses the right client, and stores
    the fetched data for future re-training.

    Args:
        lat: Latitude of the location of interest.
        lon: Longitude of the location of interest.
        aligned_range: Pre-aligned date range for the collection window.
        tenant_id: Tenant identifier forwarded to the data client.

    Returns:
        A list of traffic record dicts; an empty list when the date
        constraint is violated, validation fails, the request times out,
        or any other error occurs (collection is best-effort).
    """
    try:
        # Double-check constraints before making request
        constraint_violated = self.date_alignment_service.check_madrid_current_month_constraint(aligned_range.end)
        if constraint_violated:
            logger.warning(f"🚫 Current month constraint violation: end_date={aligned_range.end}, no traffic data available")
            return []
        else:
            logger.info(f"Date constraints passed: end_date={aligned_range.end}, proceeding with traffic data request")
        start_date_str = aligned_range.start.isoformat()
        end_date_str = aligned_range.end.isoformat()
        # Enhanced: Fetch traffic data using new abstracted service
        # This automatically detects the appropriate city and uses the right client
        traffic_data = await self.data_client.fetch_traffic_data(
            tenant_id=tenant_id,
            start_date=start_date_str,
            end_date=end_date_str,  # NOTE(review): this kwarg line was lost between diff hunks — confirm exact name against DataClient
            latitude=lat,
            longitude=lon)
        # Enhanced validation including pedestrian inference data
        if self._validate_traffic_data_enhanced(traffic_data):
            logger.info(f"Collected and stored {len(traffic_data)} valid enhanced traffic records for re-training")
            # Log storage success with enhanced metadata
            self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, len(traffic_data), traffic_data)
            return traffic_data
        else:
            logger.warning("Invalid enhanced traffic data received")
            return []
    except asyncio.TimeoutError:
        logger.warning(f"Enhanced traffic data collection timed out")
        return []
    except Exception as e:
        # Best-effort: a failed traffic fetch must not abort overall training.
        logger.warning(f"Enhanced traffic data collection failed: {e}")
        return []
# Keep original method for backwards compatibility
# Kept for backwards compatibility with existing call sites.
async def _collect_traffic_data_with_timeout(
    self,
    lat: float,
    lon: float,
    aligned_range: AlignedDateRange,
    tenant_id: str
) -> List[Dict[str, Any]]:
    """Backwards-compatible wrapper that delegates to the enhanced collector."""
    result = await self._collect_traffic_data_with_timeout_enhanced(
        lat, lon, aligned_range, tenant_id
    )
    return result
def _log_enhanced_traffic_data_storage(self,
                                       lat: float,
                                       lon: float,
                                       aligned_range: AlignedDateRange,
                                       record_count: int,
                                       traffic_data: List[Dict[str, Any]]):
    """Emit a structured audit log for stored traffic data.

    Scans the stored records to summarise which cities, districts and data
    sources are represented, and how many records carry a pedestrian count.
    """
    # Derive per-batch metadata in single passes over the records.
    cities_detected = {rec['city'] for rec in traffic_data if rec.get('city')}
    data_sources = {rec['source'] for rec in traffic_data if rec.get('source')}
    districts_covered = {rec['district'] for rec in traffic_data if rec.get('district')}
    has_pedestrian_data = sum(
        1 for rec in traffic_data if rec.get('pedestrian_count') is not None
    )
    logger.info(
        "Enhanced traffic data stored for re-training",
        location=f"{lat:.4f},{lon:.4f}",
        date_range=f"{aligned_range.start.isoformat()} to {aligned_range.end.isoformat()}",
        records_stored=record_count,
        cities_detected=list(cities_detected),
        pedestrian_inference_coverage=f"{has_pedestrian_data}/{record_count}",
        data_sources=list(data_sources),
        districts_covered=list(districts_covered),
        storage_timestamp=datetime.now().isoformat(),
        purpose="enhanced_model_training_and_retraining",
        architecture_version="2.0_abstracted"
    )
def _log_traffic_data_storage(self,
                              lat: float,
                              lon: float,
                              aligned_range: AlignedDateRange,
                              record_count: int):
    """Legacy logging method - redirects to enhanced version.

    Kept for callers that do not have the raw records at hand; a minimal
    placeholder record is synthesised so the enhanced logger can still
    summarise the batch. (The diff render had left the removed legacy
    logger.info body and a stranded bare-string docstring in this method;
    only the new redirect body is kept.)
    """
    # Create minimal traffic data structure for enhanced logging
    minimal_traffic_data = [{"city": "madrid", "source": "legacy"}] * min(record_count, 1)
    self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, record_count, minimal_traffic_data)
async def retrieve_stored_traffic_for_retraining(
self,
@@ -491,32 +536,73 @@ class TrainingDataOrchestrator:
return is_valid
def _validate_traffic_data_enhanced(self, traffic_data: List[Dict[str, Any]]) -> bool:
    """Enhanced validation for traffic data including pedestrian inference and city-specific fields.

    Each record is scored: +1 when all required fields are present and
    non-null, +1 when at least one traffic measurement field is present,
    +1 when at least two enhanced fields are present. A record is valid
    with a score >= 2; the batch is valid when at least 30% of records
    are valid (traffic data is often sparse). Enhancement and
    city-awareness rates are computed for logging only and do not affect
    the return value.

    Args:
        traffic_data: Raw traffic record dicts to validate.

    Returns:
        True when the basic validity threshold is met, False otherwise
        (including for an empty input list).
    """
    if not traffic_data:
        return False
    required_fields = ['date']
    traffic_fields = ['traffic_volume', 'traffic_intensity', 'intensidad', 'trafico']
    enhanced_fields = ['pedestrian_count', 'congestion_level', 'source']
    city_specific_fields = ['city', 'measurement_point_id', 'district']
    valid_records = 0
    enhanced_records = 0
    city_aware_records = 0
    for record in traffic_data:
        record_score = 0
        # Check required fields
        if all(field in record and record[field] is not None for field in required_fields):
            record_score += 1
        # Check traffic data fields
        if any(field in record and record[field] is not None for field in traffic_fields):
            record_score += 1
        # Check enhanced fields (pedestrian inference, etc.)
        enhanced_count = sum(1 for field in enhanced_fields
                             if field in record and record[field] is not None)
        if enhanced_count >= 2:  # At least 2 enhanced fields
            enhanced_records += 1
            record_score += 1
        # Check city-specific awareness
        city_count = sum(1 for field in city_specific_fields
                         if field in record and record[field] is not None)
        if city_count >= 1:  # At least some city awareness
            city_aware_records += 1
        # Record is valid if it has basic requirements
        if record_score >= 2:
            valid_records += 1
    # Consider valid if at least 30% of records are valid (traffic data is often sparse)
    total_records = len(traffic_data)
    validity_threshold = 0.3
    enhancement_threshold = 0.2  # Lower threshold for enhanced features
    basic_validity = (valid_records / total_records) >= validity_threshold
    has_enhancements = (enhanced_records / total_records) >= enhancement_threshold
    has_city_awareness = (city_aware_records / total_records) >= enhancement_threshold
    logger.info("Enhanced traffic data validation results",
                total_records=total_records,
                valid_records=valid_records,
                enhanced_records=enhanced_records,
                city_aware_records=city_aware_records,
                basic_validity=basic_validity,
                has_enhancements=has_enhancements,
                has_city_awareness=has_city_awareness)
    if not basic_validity:
        logger.warning(f"Traffic data basic validation failed: {valid_records}/{total_records} valid records")
    return basic_validity
def _validate_traffic_data(self, traffic_data: List[Dict[str, Any]]) -> bool:
    """Backwards-compatible alias for the enhanced validation routine."""
    # All validation logic now lives in the enhanced implementation.
    return self._validate_traffic_data_enhanced(traffic_data)
def _validate_data_sources(
self,