Improve the traffic fetching system
This commit is contained in:
@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from dataclasses import dataclass
|
||||
import asyncio
|
||||
import logging
|
||||
import structlog
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import timezone
|
||||
import pandas as pd
|
||||
@@ -24,7 +24,7 @@ from app.services.messaging import (
|
||||
publish_job_failed
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = structlog.get_logger()
|
||||
|
||||
@dataclass
|
||||
class TrainingDataSet:
|
||||
@@ -39,15 +39,14 @@ class TrainingDataOrchestrator:
|
||||
"""
|
||||
Enhanced orchestrator for data collection from multiple sources.
|
||||
Ensures date alignment, handles data source constraints, and prepares data for ML training.
|
||||
Uses the new abstracted traffic service layer for multi-city support.
|
||||
"""
|
||||
|
||||
def __init__(self,
             madrid_client=None,
             weather_client=None,
             date_alignment_service: Optional[DateAlignmentService] = None):
    """Create the orchestrator's data client, alignment service and concurrency limit.

    Args:
        madrid_client: Accepted for call-site compatibility; not used here.
        weather_client: Accepted for call-site compatibility; not used here.
        date_alignment_service: Service used for date-range alignment and
            constraint checks. A fresh ``DateAlignmentService`` is created
            when the argument is omitted (or falsy).
    """
    self.data_client = DataClient()
    self.date_alignment_service = date_alignment_service or DateAlignmentService()
    # Single assignment: the earlier value of 3 was the pre-change line and
    # was immediately overwritten by this one.
    self.max_concurrent_requests = 5  # Increased for better performance
|
||||
|
||||
async def prepare_training_data(
|
||||
self,
|
||||
@@ -281,11 +280,11 @@ class TrainingDataOrchestrator:
|
||||
)
|
||||
tasks.append(("weather", weather_task))
|
||||
|
||||
# Traffic data collection
|
||||
# Enhanced Traffic data collection (supports multiple cities)
|
||||
if DataSourceType.MADRID_TRAFFIC in aligned_range.available_sources:
|
||||
logger.info(f"🚛 Traffic data source available, creating collection task for date range: {aligned_range.start} to {aligned_range.end}")
|
||||
logger.info(f"🚛 Traffic data source available for multiple cities, creating collection task for date range: {aligned_range.start} to {aligned_range.end}")
|
||||
traffic_task = asyncio.create_task(
|
||||
self._collect_traffic_data_with_timeout(lat, lon, aligned_range, tenant_id)
|
||||
self._collect_traffic_data_with_timeout_enhanced(lat, lon, aligned_range, tenant_id)
|
||||
)
|
||||
tasks.append(("traffic", traffic_task))
|
||||
else:
|
||||
@@ -353,28 +352,31 @@ class TrainingDataOrchestrator:
|
||||
logger.warning(f"Weather data collection failed: {e}, using synthetic data")
|
||||
return self._generate_synthetic_weather_data(aligned_range)
|
||||
|
||||
async def _collect_traffic_data_with_timeout(
|
||||
async def _collect_traffic_data_with_timeout_enhanced(
|
||||
self,
|
||||
lat: float,
|
||||
lon: float,
|
||||
aligned_range: AlignedDateRange,
|
||||
tenant_id: str
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Collect traffic data with enhanced storage and retrieval for re-training"""
|
||||
"""
|
||||
Enhanced traffic data collection with multi-city support and improved storage
|
||||
Uses the new abstracted traffic service layer
|
||||
"""
|
||||
try:
|
||||
|
||||
# Double-check Madrid constraint before making request
|
||||
# Double-check constraints before making request
|
||||
constraint_violated = self.date_alignment_service.check_madrid_current_month_constraint(aligned_range.end)
|
||||
if constraint_violated:
|
||||
logger.warning(f"🚫 Madrid current month constraint violation: end_date={aligned_range.end}, no traffic data available")
|
||||
logger.warning(f"🚫 Current month constraint violation: end_date={aligned_range.end}, no traffic data available")
|
||||
return []
|
||||
else:
|
||||
logger.info(f"✅ Madrid constraint passed: end_date={aligned_range.end}, proceeding with traffic data request")
|
||||
logger.info(f"✅ Date constraints passed: end_date={aligned_range.end}, proceeding with traffic data request")
|
||||
|
||||
start_date_str = aligned_range.start.isoformat()
|
||||
end_date_str = aligned_range.end.isoformat()
|
||||
|
||||
# Fetch traffic data - this will automatically store it for future re-training
|
||||
# Enhanced: Fetch traffic data using new abstracted service
|
||||
# This automatically detects the appropriate city and uses the right client
|
||||
traffic_data = await self.data_client.fetch_traffic_data(
|
||||
tenant_id=tenant_id,
|
||||
start_date=start_date_str,
|
||||
@@ -382,39 +384,82 @@ class TrainingDataOrchestrator:
|
||||
latitude=lat,
|
||||
longitude=lon)
|
||||
|
||||
# Validate traffic data
|
||||
if self._validate_traffic_data(traffic_data):
|
||||
logger.info(f"Collected and stored {len(traffic_data)} valid traffic records for re-training")
|
||||
# Enhanced validation including pedestrian inference data
|
||||
if self._validate_traffic_data_enhanced(traffic_data):
|
||||
logger.info(f"Collected and stored {len(traffic_data)} valid enhanced traffic records for re-training")
|
||||
|
||||
# Log storage success for audit purposes
|
||||
self._log_traffic_data_storage(lat, lon, aligned_range, len(traffic_data))
|
||||
# Log storage success with enhanced metadata
|
||||
self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, len(traffic_data), traffic_data)
|
||||
|
||||
return traffic_data
|
||||
else:
|
||||
logger.warning("Invalid traffic data received")
|
||||
logger.warning("Invalid enhanced traffic data received")
|
||||
return []
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(f"Traffic data collection timed out")
|
||||
logger.warning(f"Enhanced traffic data collection timed out")
|
||||
return []
|
||||
except Exception as e:
|
||||
logger.warning(f"Traffic data collection failed: {e}")
|
||||
logger.warning(f"Enhanced traffic data collection failed: {e}")
|
||||
return []
|
||||
|
||||
# Keep original method for backwards compatibility
|
||||
async def _collect_traffic_data_with_timeout(
    self,
    lat: float,
    lon: float,
    aligned_range: AlignedDateRange,
    tenant_id: str
) -> List[Dict[str, Any]]:
    """Backwards-compatible alias for the enhanced traffic collector.

    All four arguments are forwarded unchanged to
    ``_collect_traffic_data_with_timeout_enhanced`` and its awaited result
    is returned as-is.
    """
    enhanced_collector = self._collect_traffic_data_with_timeout_enhanced
    records = await enhanced_collector(lat, lon, aligned_range, tenant_id)
    return records
|
||||
|
||||
def _log_enhanced_traffic_data_storage(self,
                                     lat: float,
                                     lon: float,
                                     aligned_range: AlignedDateRange,
                                     record_count: int,
                                     traffic_data: List[Dict[str, Any]]):
    """Emit one structured log event summarising a stored traffic batch.

    Args:
        lat: Latitude of the request location (logged with 4 decimals).
        lon: Longitude of the request location (logged with 4 decimals).
        aligned_range: Date range of the batch; ``start`` and ``end`` are
            logged via ``isoformat()``.
        record_count: Number of records reported as stored. It is logged
            verbatim and used as the denominator of the pedestrian coverage
            string; it is not recomputed from ``traffic_data``.
        traffic_data: Records scanned for the ``city``, ``source``,
            ``district`` and ``pedestrian_count`` keys.
    """
    cities_detected = set()
    data_sources = set()
    districts_covered = set()
    # Number of records carrying a non-None pedestrian_count.
    pedestrian_records = 0

    for record in traffic_data:
        # Falsy values (None, "", 0) are treated as "not provided".
        if record.get('city'):
            cities_detected.add(record['city'])
        if record.get('pedestrian_count') is not None:
            pedestrian_records += 1
        if record.get('source'):
            data_sources.add(record['source'])
        if record.get('district'):
            districts_covered.add(record['district'])

    logger.info(
        "Enhanced traffic data stored for re-training",
        location=f"{lat:.4f},{lon:.4f}",
        date_range=f"{aligned_range.start.isoformat()} to {aligned_range.end.isoformat()}",
        records_stored=record_count,
        # Sets have no stable order; sort so identical batches produce
        # identical log events. key=str keeps mixed value types sortable.
        cities_detected=sorted(cities_detected, key=str),
        pedestrian_inference_coverage=f"{pedestrian_records}/{record_count}",
        data_sources=sorted(data_sources, key=str),
        districts_covered=sorted(districts_covered, key=str),
        # Timezone-aware UTC timestamp so the audit entry is unambiguous
        # regardless of the host's local timezone.
        storage_timestamp=datetime.now(timezone.utc).isoformat(),
        purpose="enhanced_model_training_and_retraining",
        architecture_version="2.0_abstracted"
    )
|
||||
|
||||
def _log_traffic_data_storage(self,
                            lat: float,
                            lon: float,
                            aligned_range: AlignedDateRange,
                            record_count: int):
    """Legacy logging method - redirects to enhanced version.

    Only the enhanced logger is invoked, so each call produces a single log
    event (the former direct ``logger.info`` call is not repeated here).

    Args:
        lat: Latitude of the request location.
        lon: Longitude of the request location.
        aligned_range: Date range covered by the stored batch.
        record_count: Number of records reported as stored.
    """
    # The legacy signature carries no records, so hand the enhanced logger a
    # single placeholder record (or none when record_count < 1).
    minimal_traffic_data = [{"city": "madrid", "source": "legacy"}] * min(record_count, 1)
    self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, record_count, minimal_traffic_data)
|
||||
|
||||
async def retrieve_stored_traffic_for_retraining(
|
||||
self,
|
||||
@@ -491,32 +536,73 @@ class TrainingDataOrchestrator:
|
||||
|
||||
return is_valid
|
||||
|
||||
def _validate_traffic_data(self, traffic_data: List[Dict[str, Any]]) -> bool:
|
||||
"""Validate traffic data quality"""
|
||||
def _validate_traffic_data_enhanced(self, traffic_data: List[Dict[str, Any]]) -> bool:
|
||||
"""Enhanced validation for traffic data including pedestrian inference and city-specific fields"""
|
||||
if not traffic_data:
|
||||
return False
|
||||
|
||||
required_fields = ['date']
|
||||
traffic_fields = ['traffic_volume', 'traffic_intensity', 'intensidad', 'trafico']
|
||||
enhanced_fields = ['pedestrian_count', 'congestion_level', 'source']
|
||||
city_specific_fields = ['city', 'measurement_point_id', 'district']
|
||||
|
||||
valid_records = 0
|
||||
enhanced_records = 0
|
||||
city_aware_records = 0
|
||||
|
||||
for record in traffic_data:
|
||||
# Check required fields
|
||||
if not all(field in record for field in required_fields):
|
||||
continue
|
||||
record_score = 0
|
||||
|
||||
# Check at least one traffic field exists
|
||||
# Check required fields
|
||||
if all(field in record and record[field] is not None for field in required_fields):
|
||||
record_score += 1
|
||||
|
||||
# Check traffic data fields
|
||||
if any(field in record and record[field] is not None for field in traffic_fields):
|
||||
record_score += 1
|
||||
|
||||
# Check enhanced fields (pedestrian inference, etc.)
|
||||
enhanced_count = sum(1 for field in enhanced_fields
|
||||
if field in record and record[field] is not None)
|
||||
if enhanced_count >= 2: # At least 2 enhanced fields
|
||||
enhanced_records += 1
|
||||
record_score += 1
|
||||
|
||||
# Check city-specific awareness
|
||||
city_count = sum(1 for field in city_specific_fields
|
||||
if field in record and record[field] is not None)
|
||||
if city_count >= 1: # At least some city awareness
|
||||
city_aware_records += 1
|
||||
|
||||
# Record is valid if it has basic requirements
|
||||
if record_score >= 2:
|
||||
valid_records += 1
|
||||
|
||||
# Consider valid if at least 30% of records are valid (traffic data is often sparse)
|
||||
total_records = len(traffic_data)
|
||||
validity_threshold = 0.3
|
||||
is_valid = (valid_records / len(traffic_data)) >= validity_threshold
|
||||
enhancement_threshold = 0.2 # Lower threshold for enhanced features
|
||||
|
||||
if not is_valid:
|
||||
logger.warning(f"Traffic data validation failed: {valid_records}/{len(traffic_data)} valid records")
|
||||
basic_validity = (valid_records / total_records) >= validity_threshold
|
||||
has_enhancements = (enhanced_records / total_records) >= enhancement_threshold
|
||||
has_city_awareness = (city_aware_records / total_records) >= enhancement_threshold
|
||||
|
||||
return is_valid
|
||||
logger.info("Enhanced traffic data validation results",
|
||||
total_records=total_records,
|
||||
valid_records=valid_records,
|
||||
enhanced_records=enhanced_records,
|
||||
city_aware_records=city_aware_records,
|
||||
basic_validity=basic_validity,
|
||||
has_enhancements=has_enhancements,
|
||||
has_city_awareness=has_city_awareness)
|
||||
|
||||
if not basic_validity:
|
||||
logger.warning(f"Traffic data basic validation failed: {valid_records}/{total_records} valid records")
|
||||
|
||||
return basic_validity
|
||||
|
||||
def _validate_traffic_data(self, traffic_data: List[Dict[str, Any]]) -> bool:
    """Backwards-compatible alias for ``_validate_traffic_data_enhanced``.

    The records are passed through untouched and the enhanced validator's
    verdict is returned unchanged.
    """
    verdict = self._validate_traffic_data_enhanced(traffic_data)
    return verdict
|
||||
|
||||
def _validate_data_sources(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user