Improve the traffic fetching system
services/data/app/core/performance.py (new file, +312 lines)
@@ -0,0 +1,312 @@
# ================================================================
# services/data/app/core/performance.py
# ================================================================
"""
Performance optimization utilities for async operations
"""

import asyncio
import functools
from typing import Any, Callable, Dict, Optional, TypeVar
from datetime import datetime, timedelta, timezone
import hashlib
import json
import structlog

logger = structlog.get_logger()

T = TypeVar('T')


class AsyncCache:
    """Simple in-memory async cache with TTL"""

    def __init__(self, default_ttl: int = 300):
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.default_ttl = default_ttl

    def _generate_key(self, *args, **kwargs) -> str:
        """Generate cache key from arguments"""
        key_data = {
            'args': args,
            'kwargs': sorted(kwargs.items())
        }
        key_string = json.dumps(key_data, sort_keys=True, default=str)
        return hashlib.md5(key_string.encode()).hexdigest()

    def _is_expired(self, entry: Dict[str, Any]) -> bool:
        """Check if cache entry is expired"""
        expires_at = entry.get('expires_at')
        if not expires_at:
            return True
        return datetime.now(timezone.utc) > expires_at

    async def get(self, key: str) -> Optional[Any]:
        """Get value from cache"""
        if key in self.cache:
            entry = self.cache[key]
            if not self._is_expired(entry):
                logger.debug("Cache hit", cache_key=key)
                return entry['value']
            else:
                # Clean up expired entry
                del self.cache[key]
                logger.debug("Cache expired", cache_key=key)

        logger.debug("Cache miss", cache_key=key)
        return None

    async def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """Set value in cache"""
        ttl = ttl or self.default_ttl
        expires_at = datetime.now(timezone.utc) + timedelta(seconds=ttl)

        self.cache[key] = {
            'value': value,
            'expires_at': expires_at,
            'created_at': datetime.now(timezone.utc)
        }

        logger.debug("Cache set", cache_key=key, ttl=ttl)

    async def clear(self) -> None:
        """Clear all cache entries"""
        self.cache.clear()
        logger.info("Cache cleared")

    async def cleanup_expired(self) -> int:
        """Clean up expired entries"""
        expired_keys = [
            key for key, entry in self.cache.items()
            if self._is_expired(entry)
        ]

        for key in expired_keys:
            del self.cache[key]

        if expired_keys:
            logger.info("Cleaned up expired cache entries", count=len(expired_keys))

        return len(expired_keys)


def async_cache(ttl: int = 300, cache_instance: Optional[AsyncCache] = None):
    """Decorator for caching async function results"""

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        _cache = cache_instance or AsyncCache(ttl)

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            # Generate cache key
            cache_key = _cache._generate_key(func.__name__, *args, **kwargs)

            # Try to get from cache
            cached_result = await _cache.get(cache_key)
            if cached_result is not None:
                return cached_result

            # Execute function and cache result
            result = await func(*args, **kwargs)
            await _cache.set(cache_key, result, ttl)

            return result

        # Add cache management methods
        wrapper.cache_clear = _cache.clear
        wrapper.cache_cleanup = _cache.cleanup_expired

        return wrapper

    return decorator
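
As a quick illustration of how the decorator is meant to be used, a minimal sketch follows; the fetch_weather coroutine and its payload are made up, and only the TTL argument and the cache_clear hook come from the code above. Note that a cached value of None is treated as a miss, so functions that legitimately return None are re-executed on every call.

# Hypothetical usage sketch for async_cache (not part of this commit).
import asyncio

@async_cache(ttl=60)
async def fetch_weather(city: str) -> dict:
    # Stand-in for a real HTTP call; any awaited work is cached the same way.
    await asyncio.sleep(0.1)
    return {"city": city, "temp_c": 21}

async def main():
    first = await fetch_weather("madrid")   # executes and caches the result
    second = await fetch_weather("madrid")  # served from cache until the 60s TTL expires
    assert first == second
    await fetch_weather.cache_clear()       # management hook added by the decorator

asyncio.run(main())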

class ConnectionPool:
    """Simple connection pool for HTTP clients"""

    def __init__(self, max_connections: int = 10):
        self.max_connections = max_connections
        self.semaphore = asyncio.Semaphore(max_connections)
        self._active_connections = 0

    async def acquire(self):
        """Acquire a connection slot"""
        await self.semaphore.acquire()
        self._active_connections += 1
        logger.debug("Connection acquired", active=self._active_connections, max=self.max_connections)

    async def release(self):
        """Release a connection slot"""
        self.semaphore.release()
        self._active_connections = max(0, self._active_connections - 1)
        logger.debug("Connection released", active=self._active_connections, max=self.max_connections)

    async def __aenter__(self):
        await self.acquire()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.release()


def rate_limit(calls: int, period: int):
    """Rate limiting decorator"""

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        call_times = []
        lock = asyncio.Lock()

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            async with lock:
                now = datetime.now(timezone.utc)

                # Remove old call times
                cutoff = now - timedelta(seconds=period)
                call_times[:] = [t for t in call_times if t > cutoff]

                # Check rate limit
                if len(call_times) >= calls:
                    sleep_time = (call_times[0] + timedelta(seconds=period) - now).total_seconds()
                    if sleep_time > 0:
                        logger.warning("Rate limit reached, sleeping", sleep_time=sleep_time)
                        await asyncio.sleep(sleep_time)

                # Record this call
                call_times.append(now)

            return await func(*args, **kwargs)

        return wrapper

    return decorator

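A minimal sketch of the two primitives working together; call_external_api, its endpoint strings, and the sleep that stands in for an HTTP round trip are made up.

# Hypothetical usage sketch for rate_limit and ConnectionPool (not part of this commit).
import asyncio

pool = ConnectionPool(max_connections=5)

@rate_limit(calls=10, period=1)  # at most 10 calls per rolling 1-second window
async def call_external_api(endpoint: str) -> str:
    async with pool:  # limits concurrent requests via the semaphore
        await asyncio.sleep(0.05)  # stand-in for the real HTTP round trip
        return f"response from {endpoint}"

async def main():
    responses = await asyncio.gather(*(call_external_api(f"/v1/item/{i}") for i in range(12)))
    print(len(responses))  # 12; calls beyond the limit are delayed, not dropped

asyncio.run(main())
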
async def batch_process(
    items: list,
    process_func: Callable,
    batch_size: int = 10,
    max_concurrency: int = 5
) -> list:
    """Process items in batches with controlled concurrency"""

    results = []
    semaphore = asyncio.Semaphore(max_concurrency)

    async def process_batch(batch):
        async with semaphore:
            return await process_func(batch)

    # Create batches
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

    logger.info("Processing items in batches",
                total_items=len(items),
                batches=len(batches),
                batch_size=batch_size,
                max_concurrency=max_concurrency)

    # Process batches concurrently
    batch_results = await asyncio.gather(
        *[process_batch(batch) for batch in batches],
        return_exceptions=True
    )

    # Flatten results
    for batch_result in batch_results:
        if isinstance(batch_result, Exception):
            logger.error("Batch processing error", error=str(batch_result))
            continue

        if isinstance(batch_result, list):
            results.extend(batch_result)
        else:
            results.append(batch_result)

    logger.info("Batch processing completed",
                processed_items=len(results),
                total_batches=len(batches))

    return results


class PerformanceMonitor:
    """Simple performance monitoring for async functions"""

    def __init__(self):
        self.metrics = {}

    def record_execution(self, func_name: str, duration: float, success: bool = True):
        """Record function execution metrics"""
        if func_name not in self.metrics:
            self.metrics[func_name] = {
                'call_count': 0,
                'success_count': 0,
                'error_count': 0,
                'total_duration': 0.0,
                'min_duration': float('inf'),
                'max_duration': 0.0
            }

        metric = self.metrics[func_name]
        metric['call_count'] += 1
        metric['total_duration'] += duration
        metric['min_duration'] = min(metric['min_duration'], duration)
        metric['max_duration'] = max(metric['max_duration'], duration)

        if success:
            metric['success_count'] += 1
        else:
            metric['error_count'] += 1

    def get_metrics(self, func_name: str = None) -> dict:
        """Get performance metrics"""
        if func_name:
            metric = self.metrics.get(func_name, {})
            if metric and metric['call_count'] > 0:
                metric['avg_duration'] = metric['total_duration'] / metric['call_count']
                metric['success_rate'] = metric['success_count'] / metric['call_count']
            return metric

        return self.metrics


def monitor_performance(monitor: Optional[PerformanceMonitor] = None):
    """Decorator to monitor function performance"""

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        _monitor = monitor or PerformanceMonitor()

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            start_time = datetime.now(timezone.utc)
            success = True

            try:
                result = await func(*args, **kwargs)
                return result
            except Exception as e:
                success = False
                raise
            finally:
                end_time = datetime.now(timezone.utc)
                duration = (end_time - start_time).total_seconds()
                _monitor.record_execution(func.__name__, duration, success)

                logger.debug("Function performance",
                             function=func.__name__,
                             duration=duration,
                             success=success)

        # Add metrics access
        wrapper.get_metrics = lambda: _monitor.get_metrics(func.__name__)

        return wrapper

    return decorator


# Global instances
global_cache = AsyncCache(default_ttl=300)
global_connection_pool = ConnectionPool(max_connections=20)
global_performance_monitor = PerformanceMonitor()
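
A minimal sketch that wires the pieces together: batch_process splits the work, and monitor_performance records timings into the shared global_performance_monitor. The score_batch coroutine and its arithmetic are made up.

# Hypothetical usage sketch for batch_process and monitor_performance (not part of this commit).
import asyncio

@monitor_performance(global_performance_monitor)
async def score_batch(batch: list) -> list:
    await asyncio.sleep(0.01)          # stand-in for real per-batch work
    return [item * 2 for item in batch]

async def main():
    results = await batch_process(
        items=list(range(25)),
        process_func=score_batch,
        batch_size=10,                  # 25 items -> 3 batches
        max_concurrency=2               # at most 2 batches in flight
    )
    print(len(results))                                            # 25
    print(global_performance_monitor.get_metrics("score_batch"))   # call_count, avg_duration, ...

asyncio.run(main())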
services/data/app/external/apis/__init__.py (new file, +10 lines, vendored)
@@ -0,0 +1,10 @@
# ================================================================
# services/data/app/external/apis/__init__.py
# ================================================================
"""
External API clients module - Scalable architecture for multiple cities
"""

from .traffic import TrafficAPIClientFactory

__all__ = ["TrafficAPIClientFactory"]
services/data/app/external/apis/madrid_traffic_client.py (new file, +1689 lines, vendored): diff suppressed because it is too large.
services/data/app/external/apis/traffic.py (new file, +257 lines, vendored)
@@ -0,0 +1,257 @@
# ================================================================
# services/data/app/external/apis/traffic.py
# ================================================================
"""
Traffic API abstraction layer for multiple cities
"""

import asyncio
from abc import ABC, abstractmethod
from datetime import datetime
from enum import Enum
from typing import Dict, List, Any, Optional, Tuple
import structlog

logger = structlog.get_logger()


class SupportedCity(Enum):
    """Supported cities for traffic data collection"""
    MADRID = "madrid"
    BARCELONA = "barcelona"
    VALENCIA = "valencia"


class BaseTrafficClient(ABC):
    """
    Abstract base class for city-specific traffic clients
    Defines the contract that all traffic clients must implement
    """

    def __init__(self, city: SupportedCity):
        self.city = city
        self.logger = structlog.get_logger().bind(city=city.value)

    @abstractmethod
    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        """Get current traffic data for location"""
        pass

    @abstractmethod
    async def get_historical_traffic(self, latitude: float, longitude: float,
                                     start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        """Get historical traffic data"""
        pass

    @abstractmethod
    async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
        """Get traffic incidents and events"""
        pass

    @abstractmethod
    def supports_location(self, latitude: float, longitude: float) -> bool:
        """Check if this client supports the given location"""
        pass


class TrafficAPIClientFactory:
    """
    Factory class to create appropriate traffic clients based on location
    """

    # City geographical bounds
    CITY_BOUNDS = {
        SupportedCity.MADRID: {
            'lat_min': 40.31, 'lat_max': 40.56,
            'lon_min': -3.89, 'lon_max': -3.51
        },
        SupportedCity.BARCELONA: {
            'lat_min': 41.32, 'lat_max': 41.47,
            'lon_min': 2.05, 'lon_max': 2.25
        },
        SupportedCity.VALENCIA: {
            'lat_min': 39.42, 'lat_max': 39.52,
            'lon_min': -0.42, 'lon_max': -0.32
        }
    }

    @classmethod
    def get_client_for_location(cls, latitude: float, longitude: float) -> Optional[BaseTrafficClient]:
        """
        Get appropriate traffic client for given location

        Args:
            latitude: Query location latitude
            longitude: Query location longitude

        Returns:
            BaseTrafficClient instance or None if location not supported
        """
        try:
            # Check each city's bounds
            for city, bounds in cls.CITY_BOUNDS.items():
                if (bounds['lat_min'] <= latitude <= bounds['lat_max'] and
                        bounds['lon_min'] <= longitude <= bounds['lon_max']):

                    logger.info("Location matched to city",
                                city=city.value, lat=latitude, lon=longitude)
                    return cls._create_client(city)

            # If no specific city matches, try to find closest supported city
            closest_city = cls._find_closest_city(latitude, longitude)
            if closest_city:
                logger.info("Using closest city for location",
                            closest_city=closest_city.value, lat=latitude, lon=longitude)
                return cls._create_client(closest_city)

            logger.warning("No traffic client available for location",
                           lat=latitude, lon=longitude)
            return None

        except Exception as e:
            logger.error("Error getting traffic client for location",
                         lat=latitude, lon=longitude, error=str(e))
            return None

    @classmethod
    def _create_client(cls, city: SupportedCity) -> BaseTrafficClient:
        """Create traffic client for specific city"""
        if city == SupportedCity.MADRID:
            from .madrid_traffic_client import MadridTrafficClient
            return MadridTrafficClient()
        elif city == SupportedCity.BARCELONA:
            # Future implementation
            raise NotImplementedError(f"Traffic client for {city.value} not yet implemented")
        elif city == SupportedCity.VALENCIA:
            # Future implementation
            raise NotImplementedError(f"Traffic client for {city.value} not yet implemented")
        else:
            raise ValueError(f"Unsupported city: {city}")

    @classmethod
    def _find_closest_city(cls, latitude: float, longitude: float) -> Optional[SupportedCity]:
        """Find closest supported city to given coordinates"""
        import math

        def distance(lat1, lon1, lat2, lon2):
            """Calculate distance between two coordinates"""
            R = 6371  # Earth's radius in km
            dlat = math.radians(lat2 - lat1)
            dlon = math.radians(lon2 - lon1)
            a = (math.sin(dlat/2) * math.sin(dlat/2) +
                 math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
                 math.sin(dlon/2) * math.sin(dlon/2))
            c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
            return R * c

        min_distance = float('inf')
        closest_city = None

        # City centers for distance calculation
        city_centers = {
            SupportedCity.MADRID: (40.4168, -3.7038),
            SupportedCity.BARCELONA: (41.3851, 2.1734),
            SupportedCity.VALENCIA: (39.4699, -0.3763)
        }

        for city, (city_lat, city_lon) in city_centers.items():
            dist = distance(latitude, longitude, city_lat, city_lon)
            if dist < min_distance and dist < 100:  # Within 100km
                min_distance = dist
                closest_city = city

        return closest_city

    @classmethod
    def get_supported_cities(cls) -> List[Dict[str, Any]]:
        """Get list of supported cities with their bounds"""
        cities = []
        for city, bounds in cls.CITY_BOUNDS.items():
            cities.append({
                "city": city.value,
                "bounds": bounds,
                "status": "active" if city == SupportedCity.MADRID else "planned"
            })
        return cities

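To make the contract concrete, a hypothetical skeleton of a future city client is sketched below; none of it exists in the commit, it relies on the imports at the top of this file, and a real Barcelona client would call that city's open-data API instead of returning empty results.

# Hypothetical skeleton (not part of this commit) showing what _create_client()
# expects a concrete BaseTrafficClient subclass to provide.
class BarcelonaTrafficClientSketch(BaseTrafficClient):

    def __init__(self):
        super().__init__(SupportedCity.BARCELONA)

    def supports_location(self, latitude: float, longitude: float) -> bool:
        # Reuse the factory's published bounds for the city.
        bounds = TrafficAPIClientFactory.CITY_BOUNDS[SupportedCity.BARCELONA]
        return (bounds['lat_min'] <= latitude <= bounds['lat_max'] and
                bounds['lon_min'] <= longitude <= bounds['lon_max'])

    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        # A real implementation would fetch and normalize the city's live feed here.
        return None

    async def get_historical_traffic(self, latitude: float, longitude: float,
                                     start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        return []

    async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
        return []
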
class UniversalTrafficClient:
    """
    Universal traffic client that delegates to appropriate city-specific clients
    This is the main interface that external services should use
    """

    def __init__(self):
        self.factory = TrafficAPIClientFactory()
        self.client_cache = {}  # Cache clients for performance

    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[Dict[str, Any]]:
        """Get current traffic data for any supported location"""
        try:
            client = self._get_client_for_location(latitude, longitude)
            if client:
                return await client.get_current_traffic(latitude, longitude)
            else:
                logger.warning("No traffic data available for location",
                               lat=latitude, lon=longitude)
                return None
        except Exception as e:
            logger.error("Error getting current traffic",
                         lat=latitude, lon=longitude, error=str(e))
            return None

    async def get_historical_traffic(self, latitude: float, longitude: float,
                                     start_date: datetime, end_date: datetime) -> List[Dict[str, Any]]:
        """Get historical traffic data for any supported location"""
        try:
            client = self._get_client_for_location(latitude, longitude)
            if client:
                return await client.get_historical_traffic(latitude, longitude, start_date, end_date)
            else:
                logger.warning("No historical traffic data available for location",
                               lat=latitude, lon=longitude)
                return []
        except Exception as e:
            logger.error("Error getting historical traffic",
                         lat=latitude, lon=longitude, error=str(e))
            return []

    async def get_events(self, latitude: float, longitude: float, radius_km: float = 5.0) -> List[Dict[str, Any]]:
        """Get traffic events for any supported location"""
        try:
            client = self._get_client_for_location(latitude, longitude)
            if client:
                return await client.get_events(latitude, longitude, radius_km)
            else:
                return []
        except Exception as e:
            logger.error("Error getting traffic events",
                         lat=latitude, lon=longitude, error=str(e))
            return []

    def _get_client_for_location(self, latitude: float, longitude: float) -> Optional[BaseTrafficClient]:
        """Get cached or create new client for location"""
        cache_key = f"{latitude:.4f},{longitude:.4f}"

        if cache_key not in self.client_cache:
            client = self.factory.get_client_for_location(latitude, longitude)
            self.client_cache[cache_key] = client

        return self.client_cache[cache_key]

    def get_location_info(self, latitude: float, longitude: float) -> Dict[str, Any]:
        """Get information about traffic data availability for location"""
        client = self._get_client_for_location(latitude, longitude)
        if client:
            return {
                "supported": True,
                "city": client.city.value,
                "features": ["current_traffic", "historical_traffic", "events"]
            }
        else:
            return {
                "supported": False,
                "city": None,
                "features": [],
                "message": "No traffic data available for this location"
            }
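
A minimal usage sketch of the universal client; the coordinates are Madrid's city centre, and the calls only return real data when the MadridTrafficClient added in this commit can reach its data source.

# Hypothetical usage sketch for UniversalTrafficClient (not part of this commit).
import asyncio
from datetime import datetime, timedelta, timezone

async def main():
    client = UniversalTrafficClient()

    # Falls inside the MADRID bounds, so the Madrid client is selected.
    info = client.get_location_info(40.4168, -3.7038)
    print(info)  # {'supported': True, 'city': 'madrid', 'features': [...]}

    current = await client.get_current_traffic(40.4168, -3.7038)
    yesterday = datetime.now(timezone.utc) - timedelta(days=1)
    history = await client.get_historical_traffic(40.4168, -3.7038,
                                                  start_date=yesterday,
                                                  end_date=datetime.now(timezone.utc))
    events = await client.get_events(40.4168, -3.7038, radius_km=2.0)
    print(current, len(history), len(events))

asyncio.run(main())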
services/data/app/external/base_client.py (vendored, +26 lines)
@@ -54,6 +54,19 @@ class BaseAPIClient:
            logger.error("Unexpected error", error=str(e), url=url)
            return None

    async def get(self, url: str, headers: Optional[Dict] = None, timeout: Optional[int] = None) -> httpx.Response:
        """
        Public GET method for direct HTTP requests
        Returns the raw httpx Response object for maximum flexibility
        """
        request_headers = headers or {}
        request_timeout = httpx.Timeout(timeout if timeout else 30.0)

        async with httpx.AsyncClient(timeout=request_timeout, follow_redirects=True) as client:
            response = await client.get(url, headers=request_headers)
            response.raise_for_status()
            return response

    async def _fetch_url_directly(self, url: str, headers: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
        """Fetch data directly from a full URL (for AEMET datos URLs)"""
        try:
@@ -124,3 +137,16 @@ class BaseAPIClient:
        except Exception as e:
            logger.error("Unexpected error", error=str(e), url=url)
            return None

    async def get(self, url: str, headers: Optional[Dict] = None, timeout: Optional[int] = None) -> httpx.Response:
        """
        Public GET method for direct HTTP requests
        Returns the raw httpx Response object for maximum flexibility
        """
        request_headers = headers or {}
        request_timeout = httpx.Timeout(timeout if timeout else 30.0)

        async with httpx.AsyncClient(timeout=request_timeout, follow_redirects=True) as client:
            response = await client.get(url, headers=request_headers)
            response.raise_for_status()
            return response
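
A minimal sketch of the new helper; it assumes a BaseAPIClient (or subclass) instance can be constructed without arguments, which may not match the real constructor, and the URL is a placeholder.

# Hypothetical usage sketch for BaseAPIClient.get() (not part of this commit).
import asyncio

async def main():
    client = BaseAPIClient()  # assumption: a usable instance; real code uses a concrete subclass
    response = await client.get("https://example.com/api/health", timeout=10)
    print(response.status_code)  # raw httpx.Response; raise_for_status() has already run
    print(response.json())

asyncio.run(main())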
services/data/app/external/madrid_opendata.py (vendored, 1409 lines changed): diff suppressed because it is too large.
services/data/app/models/traffic.py (modified, +294 lines)
@@ -1,30 +1,294 @@
# ================================================================
# services/data/app/models/traffic.py - Enhanced for Multiple Cities
# ================================================================
"""
Flexible traffic data models supporting multiple cities and extensible schemas
"""

from sqlalchemy import Column, String, DateTime, Float, Integer, Text, Index, Boolean, JSON
from sqlalchemy.dialects.postgresql import UUID
import uuid
from datetime import datetime, timezone
from typing import Dict, Any, Optional

from shared.database.base import Base


class TrafficData(Base):
    """
    Flexible traffic data model supporting multiple cities
    Designed to accommodate varying data structures across different cities
    """
    __tablename__ = "traffic_data"

    # Primary identification
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)

    # Location and temporal data
    location_id = Column(String(100), nullable=False, index=True)  # "lat,lon" or city-specific ID
    city = Column(String(50), nullable=False, index=True)  # madrid, barcelona, valencia, etc.
    date = Column(DateTime(timezone=True), nullable=False, index=True)

    # Core standardized traffic metrics (common across all cities)
    traffic_volume = Column(Integer, nullable=True)  # Vehicle count or intensity
    congestion_level = Column(String(20), nullable=True)  # low, medium, high, blocked
    average_speed = Column(Float, nullable=True)  # Average speed in km/h

    # Enhanced metrics (may not be available for all cities)
    occupation_percentage = Column(Float, nullable=True)  # Road occupation %
    load_percentage = Column(Float, nullable=True)  # Traffic load %
    pedestrian_count = Column(Integer, nullable=True)  # Estimated pedestrian count

    # Measurement point information
    measurement_point_id = Column(String(100), nullable=True, index=True)
    measurement_point_name = Column(String(500), nullable=True)
    measurement_point_type = Column(String(50), nullable=True)  # URB, M30, A, etc.

    # Geographic data
    latitude = Column(Float, nullable=True)
    longitude = Column(Float, nullable=True)
    district = Column(String(100), nullable=True)  # City district/area
    zone = Column(String(100), nullable=True)  # Traffic zone or sector

    # Data source and quality
    source = Column(String(50), nullable=False, default="unknown")  # madrid_opendata, synthetic, etc.
    data_quality_score = Column(Float, nullable=True)  # Quality score 0-100
    is_synthetic = Column(Boolean, default=False)
    has_pedestrian_inference = Column(Boolean, default=False)

    # City-specific data (flexible JSON storage)
    city_specific_data = Column(JSON, nullable=True)  # Store city-specific fields

    # Raw data backup
    raw_data = Column(Text, nullable=True)  # Original data for debugging

    # Audit fields
    tenant_id = Column(UUID(as_uuid=True), nullable=True, index=True)  # For multi-tenancy
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True),
                        default=lambda: datetime.now(timezone.utc),
                        onupdate=lambda: datetime.now(timezone.utc))

    # Performance-optimized indexes
    __table_args__ = (
        # Core query patterns
        Index('idx_traffic_location_date', 'location_id', 'date'),
        Index('idx_traffic_city_date', 'city', 'date'),
        Index('idx_traffic_tenant_date', 'tenant_id', 'date'),

        # Advanced query patterns
        Index('idx_traffic_city_location', 'city', 'location_id'),
        Index('idx_traffic_measurement_point', 'city', 'measurement_point_id'),
        Index('idx_traffic_district_date', 'city', 'district', 'date'),

        # Training data queries
        Index('idx_traffic_training', 'tenant_id', 'city', 'date', 'is_synthetic'),
        Index('idx_traffic_quality', 'city', 'data_quality_score', 'date'),
    )

    def to_dict(self) -> Dict[str, Any]:
        """Convert model to dictionary for API responses"""
        result = {
            'id': str(self.id),
            'location_id': self.location_id,
            'city': self.city,
            'date': self.date.isoformat() if self.date else None,
            'traffic_volume': self.traffic_volume,
            'congestion_level': self.congestion_level,
            'average_speed': self.average_speed,
            'occupation_percentage': self.occupation_percentage,
            'load_percentage': self.load_percentage,
            'pedestrian_count': self.pedestrian_count,
            'measurement_point_id': self.measurement_point_id,
            'measurement_point_name': self.measurement_point_name,
            'measurement_point_type': self.measurement_point_type,
            'latitude': self.latitude,
            'longitude': self.longitude,
            'district': self.district,
            'zone': self.zone,
            'source': self.source,
            'data_quality_score': self.data_quality_score,
            'is_synthetic': self.is_synthetic,
            'has_pedestrian_inference': self.has_pedestrian_inference,
            'created_at': self.created_at.isoformat() if self.created_at else None
        }

        # Add city-specific data if present
        if self.city_specific_data:
            result['city_specific_data'] = self.city_specific_data

        return result

    def get_city_specific_field(self, field_name: str, default: Any = None) -> Any:
        """Safely get city-specific field value"""
        if self.city_specific_data and isinstance(self.city_specific_data, dict):
            return self.city_specific_data.get(field_name, default)
        return default

    def set_city_specific_field(self, field_name: str, value: Any) -> None:
        """Set city-specific field value"""
        if not self.city_specific_data:
            self.city_specific_data = {}
        if not isinstance(self.city_specific_data, dict):
            self.city_specific_data = {}
        self.city_specific_data[field_name] = value


class TrafficMeasurementPoint(Base):
    """
    Registry of traffic measurement points across all cities
    Supports different city-specific measurement point schemas
    """
    __tablename__ = "traffic_measurement_points"

    # Primary identification
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)

    # Location and identification
    city = Column(String(50), nullable=False, index=True)
    measurement_point_id = Column(String(100), nullable=False, index=True)  # City-specific ID
    name = Column(String(500), nullable=True)
    description = Column(Text, nullable=True)

    # Geographic information
    latitude = Column(Float, nullable=False)
    longitude = Column(Float, nullable=False)
    district = Column(String(100), nullable=True)
    zone = Column(String(100), nullable=True)

    # Classification
    road_type = Column(String(50), nullable=True)  # URB, M30, A, etc.
    measurement_type = Column(String(50), nullable=True)  # intensity, speed, etc.
    point_category = Column(String(50), nullable=True)  # urban, highway, ring_road

    # Status and metadata
    is_active = Column(Boolean, default=True)
    installation_date = Column(DateTime(timezone=True), nullable=True)
    last_data_received = Column(DateTime(timezone=True), nullable=True)
    data_quality_rating = Column(Float, nullable=True)  # Average quality 0-100

    # City-specific point data
    city_specific_metadata = Column(JSON, nullable=True)

    # Audit fields
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True),
                        default=lambda: datetime.now(timezone.utc),
                        onupdate=lambda: datetime.now(timezone.utc))

    __table_args__ = (
        # Ensure unique measurement points per city
        Index('idx_unique_city_point', 'city', 'measurement_point_id', unique=True),

        # Geographic queries
        Index('idx_points_city_location', 'city', 'latitude', 'longitude'),
        Index('idx_points_district', 'city', 'district'),
        Index('idx_points_road_type', 'city', 'road_type'),

        # Status queries
        Index('idx_points_active', 'city', 'is_active', 'last_data_received'),
    )

    def to_dict(self) -> Dict[str, Any]:
        """Convert measurement point to dictionary"""
        return {
            'id': str(self.id),
            'city': self.city,
            'measurement_point_id': self.measurement_point_id,
            'name': self.name,
            'description': self.description,
            'latitude': self.latitude,
            'longitude': self.longitude,
            'district': self.district,
            'zone': self.zone,
            'road_type': self.road_type,
            'measurement_type': self.measurement_type,
            'point_category': self.point_category,
            'is_active': self.is_active,
            'installation_date': self.installation_date.isoformat() if self.installation_date else None,
            'last_data_received': self.last_data_received.isoformat() if self.last_data_received else None,
            'data_quality_rating': self.data_quality_rating,
            'city_specific_metadata': self.city_specific_metadata,
            'created_at': self.created_at.isoformat() if self.created_at else None
        }


class TrafficDataBackgroundJob(Base):
    """
    Track background data collection jobs for multiple cities
    Supports scheduling and monitoring of data fetching processes
    """
    __tablename__ = "traffic_background_jobs"

    # Primary identification
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)

    # Job configuration
    job_type = Column(String(50), nullable=False)  # historical_fetch, cleanup, etc.
    city = Column(String(50), nullable=False, index=True)
    location_pattern = Column(String(200), nullable=True)  # Location pattern or specific coords

    # Scheduling
    scheduled_at = Column(DateTime(timezone=True), nullable=False)
    started_at = Column(DateTime(timezone=True), nullable=True)
    completed_at = Column(DateTime(timezone=True), nullable=True)

    # Status tracking
    status = Column(String(20), nullable=False, default='pending')  # pending, running, completed, failed
    progress_percentage = Column(Float, default=0.0)
    records_processed = Column(Integer, default=0)
    records_stored = Column(Integer, default=0)

    # Date range for data jobs
    data_start_date = Column(DateTime(timezone=True), nullable=True)
    data_end_date = Column(DateTime(timezone=True), nullable=True)

    # Results and error handling
    success_count = Column(Integer, default=0)
    error_count = Column(Integer, default=0)
    error_message = Column(Text, nullable=True)
    job_metadata = Column(JSON, nullable=True)  # Additional job-specific data

    # Tenant association
    tenant_id = Column(UUID(as_uuid=True), nullable=True, index=True)

    # Audit fields
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True),
                        default=lambda: datetime.now(timezone.utc),
                        onupdate=lambda: datetime.now(timezone.utc))

    __table_args__ = (
        # Job monitoring
        Index('idx_jobs_city_status', 'city', 'status', 'scheduled_at'),
        Index('idx_jobs_tenant_status', 'tenant_id', 'status', 'scheduled_at'),
        Index('idx_jobs_type_city', 'job_type', 'city', 'scheduled_at'),

        # Cleanup queries
        Index('idx_jobs_completed', 'status', 'completed_at'),
    )

    def to_dict(self) -> Dict[str, Any]:
        """Convert job to dictionary"""
        return {
            'id': str(self.id),
            'job_type': self.job_type,
            'city': self.city,
            'location_pattern': self.location_pattern,
            'scheduled_at': self.scheduled_at.isoformat() if self.scheduled_at else None,
            'started_at': self.started_at.isoformat() if self.started_at else None,
            'completed_at': self.completed_at.isoformat() if self.completed_at else None,
            'status': self.status,
            'progress_percentage': self.progress_percentage,
            'records_processed': self.records_processed,
            'records_stored': self.records_stored,
            'data_start_date': self.data_start_date.isoformat() if self.data_start_date else None,
            'data_end_date': self.data_end_date.isoformat() if self.data_end_date else None,
            'success_count': self.success_count,
            'error_count': self.error_count,
            'error_message': self.error_message,
            'job_metadata': self.job_metadata,
            'tenant_id': str(self.tenant_id) if self.tenant_id else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            'updated_at': self.updated_at.isoformat() if self.updated_at else None
        }
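
A short sketch of how the JSON helpers are meant to be used when building a row; the field name and values are made up, not taken from the Madrid feed.

# Hypothetical sketch of the city-specific helpers on TrafficData (not part of this commit).
from datetime import datetime, timezone

row = TrafficData(
    location_id="40.4168,-3.7038",
    city="madrid",
    date=datetime.now(timezone.utc),
    traffic_volume=450,
    congestion_level="medium",
    average_speed=32.5,
    source="madrid_opendata",
    is_synthetic=False,
)
row.set_city_specific_field("error_flag", "N")     # stored inside the JSON column
print(row.get_city_specific_field("error_flag"))   # 'N'
print(row.to_dict()["city_specific_data"])         # {'error_flag': 'N'}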
services/data/app/repositories/traffic_repository.py (new file, +874 lines)
@@ -0,0 +1,874 @@
# ================================================================
# services/data/app/repositories/traffic_repository.py
# ================================================================
"""
Traffic Repository - Enhanced for multiple cities with comprehensive data access patterns
Follows existing repository architecture while adding city-specific functionality
"""

from typing import Optional, List, Dict, Any, Type, Tuple
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, or_, func, desc, asc, text, update, delete
from sqlalchemy.orm import selectinload
from datetime import datetime, timezone, timedelta
import structlog

from .base import DataBaseRepository
from app.models.traffic import TrafficData, TrafficMeasurementPoint, TrafficDataBackgroundJob
from app.schemas.traffic import TrafficDataCreate, TrafficDataResponse
from shared.database.exceptions import DatabaseError, ValidationError

logger = structlog.get_logger()


class TrafficRepository(DataBaseRepository[TrafficData, TrafficDataCreate, Dict]):
    """
    Enhanced repository for traffic data operations across multiple cities
    Provides city-aware queries and advanced traffic analytics
    """

    def __init__(self, model_class: Type, session: AsyncSession, cache_ttl: Optional[int] = 300):
        super().__init__(model_class, session, cache_ttl)

    # ================================================================
    # CORE TRAFFIC DATA OPERATIONS
    # ================================================================

    async def get_by_location_and_date_range(
        self,
        latitude: float,
        longitude: float,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        city: Optional[str] = None,
        tenant_id: Optional[str] = None,
        skip: int = 0,
        limit: int = 100
    ) -> List[TrafficData]:
        """Get traffic data by location and date range with city filtering"""
        try:
            location_id = f"{latitude:.4f},{longitude:.4f}"

            # Build base query
            query = select(self.model).where(self.model.location_id == location_id)

            # Add city filter if specified
            if city:
                query = query.where(self.model.city == city)

            # Add tenant filter if specified
            if tenant_id:
                query = query.where(self.model.tenant_id == tenant_id)

            # Add date range filters
            if start_date:
                start_date = self._ensure_utc_datetime(start_date)
                query = query.where(self.model.date >= start_date)

            if end_date:
                end_date = self._ensure_utc_datetime(end_date)
                query = query.where(self.model.date <= end_date)

            # Order by date descending (most recent first)
            query = query.order_by(desc(self.model.date))

            # Apply pagination
            query = query.offset(skip).limit(limit)

            result = await self.session.execute(query)
            return result.scalars().all()

        except Exception as e:
            logger.error("Failed to get traffic data by location and date range",
                         latitude=latitude, longitude=longitude,
                         city=city, error=str(e))
            raise DatabaseError(f"Failed to get traffic data: {str(e)}")

    async def get_by_city_and_date_range(
        self,
        city: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        district: Optional[str] = None,
        measurement_point_ids: Optional[List[str]] = None,
        include_synthetic: bool = True,
        tenant_id: Optional[str] = None,
        skip: int = 0,
        limit: int = 1000
    ) -> List[TrafficData]:
        """Get traffic data by city with advanced filtering options"""
        try:
            # Build base query
            query = select(self.model).where(self.model.city == city)

            # Add tenant filter if specified
            if tenant_id:
                query = query.where(self.model.tenant_id == tenant_id)

            # Add date range filters
            if start_date:
                start_date = self._ensure_utc_datetime(start_date)
                query = query.where(self.model.date >= start_date)

            if end_date:
                end_date = self._ensure_utc_datetime(end_date)
                query = query.where(self.model.date <= end_date)

            # Add district filter
            if district:
                query = query.where(self.model.district == district)

            # Add measurement point filter
            if measurement_point_ids:
                query = query.where(self.model.measurement_point_id.in_(measurement_point_ids))

            # Filter synthetic data if requested
            if not include_synthetic:
                query = query.where(self.model.is_synthetic == False)

            # Order by date and measurement point
            query = query.order_by(desc(self.model.date), self.model.measurement_point_id)

            # Apply pagination
            query = query.offset(skip).limit(limit)

            result = await self.session.execute(query)
            return result.scalars().all()

        except Exception as e:
            logger.error("Failed to get traffic data by city",
                         city=city, district=district, error=str(e))
            raise DatabaseError(f"Failed to get traffic data: {str(e)}")

    async def get_latest_by_measurement_points(
        self,
        measurement_point_ids: List[str],
        city: str,
        hours_back: int = 24
    ) -> List[TrafficData]:
        """Get latest traffic data for specific measurement points"""
        try:
            cutoff_time = datetime.now(timezone.utc) - timedelta(hours=hours_back)

            query = select(self.model).where(
                and_(
                    self.model.city == city,
                    self.model.measurement_point_id.in_(measurement_point_ids),
                    self.model.date >= cutoff_time
                )
            ).order_by(
                self.model.measurement_point_id,
                desc(self.model.date)
            )

            result = await self.session.execute(query)
            all_records = result.scalars().all()

            # Get the latest record for each measurement point
            latest_records = {}
            for record in all_records:
                point_id = record.measurement_point_id
                if point_id not in latest_records:
                    latest_records[point_id] = record

            return list(latest_records.values())

        except Exception as e:
            logger.error("Failed to get latest traffic data by measurement points",
                         city=city, points=len(measurement_point_ids), error=str(e))
            raise DatabaseError(f"Failed to get latest traffic data: {str(e)}")

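A sketch of the three query helpers defined so far; it assumes an AsyncSession obtained from the service's session factory, and the measurement point IDs are placeholders.

# Hypothetical usage sketch for the repository query helpers (not part of this commit).
from datetime import datetime, timedelta, timezone

async def example_queries(session):
    repo = TrafficRepository(TrafficData, session)

    week_ago = datetime.now(timezone.utc) - timedelta(days=7)
    by_location = await repo.get_by_location_and_date_range(
        latitude=40.4168, longitude=-3.7038,
        start_date=week_ago, city="madrid", limit=50)

    by_city = await repo.get_by_city_and_date_range(
        city="madrid", district="Centro", include_synthetic=False, limit=200)

    latest = await repo.get_latest_by_measurement_points(
        measurement_point_ids=["PM10001", "PM10002"], city="madrid", hours_back=6)

    print(len(by_location), len(by_city), len(latest))
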
# ================================================================
|
||||||
|
# ANALYTICS AND AGGREGATIONS
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
async def get_traffic_statistics_by_city(
|
||||||
|
self,
|
||||||
|
city: str,
|
||||||
|
start_date: Optional[datetime] = None,
|
||||||
|
end_date: Optional[datetime] = None,
|
||||||
|
group_by: str = "daily"
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Get aggregated traffic statistics by city"""
|
||||||
|
try:
|
||||||
|
# Determine date truncation based on group_by
|
||||||
|
if group_by == "hourly":
|
||||||
|
date_trunc = "hour"
|
||||||
|
elif group_by == "daily":
|
||||||
|
date_trunc = "day"
|
||||||
|
elif group_by == "weekly":
|
||||||
|
date_trunc = "week"
|
||||||
|
elif group_by == "monthly":
|
||||||
|
date_trunc = "month"
|
||||||
|
else:
|
||||||
|
raise ValidationError(f"Invalid group_by value: {group_by}")
|
||||||
|
|
||||||
|
# Build aggregation query
|
||||||
|
if self.session.bind.dialect.name == 'postgresql':
|
||||||
|
query = text("""
|
||||||
|
SELECT
|
||||||
|
DATE_TRUNC(:date_trunc, date) as period,
|
||||||
|
city,
|
||||||
|
district,
|
||||||
|
COUNT(*) as record_count,
|
||||||
|
AVG(traffic_volume) as avg_traffic_volume,
|
||||||
|
MAX(traffic_volume) as max_traffic_volume,
|
||||||
|
AVG(pedestrian_count) as avg_pedestrian_count,
|
||||||
|
AVG(average_speed) as avg_speed,
|
||||||
|
COUNT(CASE WHEN congestion_level = 'high' THEN 1 END) as high_congestion_count,
|
||||||
|
COUNT(CASE WHEN is_synthetic = false THEN 1 END) as real_data_count,
|
||||||
|
COUNT(CASE WHEN has_pedestrian_inference = true THEN 1 END) as pedestrian_inference_count
|
||||||
|
FROM traffic_data
|
||||||
|
WHERE city = :city
|
||||||
|
""")
|
||||||
|
else:
|
||||||
|
# SQLite fallback
|
||||||
|
query = text("""
|
||||||
|
SELECT
|
||||||
|
DATE(date) as period,
|
||||||
|
city,
|
||||||
|
district,
|
||||||
|
COUNT(*) as record_count,
|
||||||
|
AVG(traffic_volume) as avg_traffic_volume,
|
||||||
|
MAX(traffic_volume) as max_traffic_volume,
|
||||||
|
AVG(pedestrian_count) as avg_pedestrian_count,
|
||||||
|
AVG(average_speed) as avg_speed,
|
||||||
|
SUM(CASE WHEN congestion_level = 'high' THEN 1 ELSE 0 END) as high_congestion_count,
|
||||||
|
SUM(CASE WHEN is_synthetic = 0 THEN 1 ELSE 0 END) as real_data_count,
|
||||||
|
SUM(CASE WHEN has_pedestrian_inference = 1 THEN 1 ELSE 0 END) as pedestrian_inference_count
|
||||||
|
FROM traffic_data
|
||||||
|
WHERE city = :city
|
||||||
|
""")
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"city": city,
|
||||||
|
"date_trunc": date_trunc
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add date filters
|
||||||
|
if start_date:
|
||||||
|
query = text(str(query) + " AND date >= :start_date")
|
||||||
|
params["start_date"] = self._ensure_utc_datetime(start_date)
|
||||||
|
|
||||||
|
if end_date:
|
||||||
|
query = text(str(query) + " AND date <= :end_date")
|
||||||
|
params["end_date"] = self._ensure_utc_datetime(end_date)
|
||||||
|
|
||||||
|
# Add GROUP BY and ORDER BY
|
||||||
|
query = text(str(query) + " GROUP BY period, city, district ORDER BY period DESC")
|
||||||
|
|
||||||
|
result = await self.session.execute(query, params)
|
||||||
|
rows = result.fetchall()
|
||||||
|
|
||||||
|
# Convert to list of dictionaries
|
||||||
|
statistics = []
|
||||||
|
for row in rows:
|
||||||
|
statistics.append({
|
||||||
|
"period": group_by,
|
||||||
|
"date": row.period,
|
||||||
|
"city": row.city,
|
||||||
|
"district": row.district,
|
||||||
|
"record_count": row.record_count,
|
||||||
|
"avg_traffic_volume": float(row.avg_traffic_volume or 0),
|
||||||
|
"max_traffic_volume": row.max_traffic_volume or 0,
|
||||||
|
"avg_pedestrian_count": float(row.avg_pedestrian_count or 0),
|
||||||
|
"avg_speed": float(row.avg_speed or 0),
|
||||||
|
"high_congestion_count": row.high_congestion_count or 0,
|
||||||
|
"real_data_percentage": round((row.real_data_count or 0) / max(1, row.record_count) * 100, 2),
|
||||||
|
"pedestrian_inference_percentage": round((row.pedestrian_inference_count or 0) / max(1, row.record_count) * 100, 2)
|
||||||
|
})
|
||||||
|
|
||||||
|
return statistics
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to get traffic statistics by city",
|
||||||
|
city=city, group_by=group_by, error=str(e))
|
||||||
|
raise DatabaseError(f"Traffic statistics query failed: {str(e)}")
|
||||||
|
|
||||||
|
async def get_congestion_heatmap_data(
|
||||||
|
self,
|
||||||
|
city: str,
|
||||||
|
start_date: datetime,
|
||||||
|
end_date: datetime,
|
||||||
|
time_granularity: str = "hour"
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Get congestion data for heatmap visualization"""
|
||||||
|
try:
|
||||||
|
if time_granularity == "hour":
|
||||||
|
time_extract = "EXTRACT(hour FROM date)"
|
||||||
|
elif time_granularity == "day_of_week":
|
||||||
|
time_extract = "EXTRACT(dow FROM date)"
|
||||||
|
else:
|
||||||
|
time_extract = "EXTRACT(hour FROM date)"
|
||||||
|
|
||||||
|
query = text(f"""
|
||||||
|
SELECT
|
||||||
|
{time_extract} as time_period,
|
||||||
|
district,
|
||||||
|
measurement_point_id,
|
||||||
|
latitude,
|
||||||
|
longitude,
|
||||||
|
AVG(CASE
|
||||||
|
WHEN congestion_level = 'low' THEN 1
|
||||||
|
WHEN congestion_level = 'medium' THEN 2
|
||||||
|
WHEN congestion_level = 'high' THEN 3
|
||||||
|
WHEN congestion_level = 'blocked' THEN 4
|
||||||
|
ELSE 1
|
||||||
|
END) as avg_congestion_score,
|
||||||
|
COUNT(*) as data_points,
|
||||||
|
AVG(traffic_volume) as avg_traffic_volume,
|
||||||
|
AVG(pedestrian_count) as avg_pedestrian_count
|
||||||
|
FROM traffic_data
|
||||||
|
WHERE city = :city
|
||||||
|
AND date >= :start_date
|
||||||
|
AND date <= :end_date
|
||||||
|
AND latitude IS NOT NULL
|
||||||
|
AND longitude IS NOT NULL
|
||||||
|
GROUP BY time_period, district, measurement_point_id, latitude, longitude
|
||||||
|
ORDER BY time_period, district, avg_congestion_score DESC
|
||||||
|
""")
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"city": city,
|
||||||
|
"start_date": self._ensure_utc_datetime(start_date),
|
||||||
|
"end_date": self._ensure_utc_datetime(end_date)
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await self.session.execute(query, params)
|
||||||
|
rows = result.fetchall()
|
||||||
|
|
||||||
|
heatmap_data = []
|
||||||
|
for row in rows:
|
||||||
|
heatmap_data.append({
|
||||||
|
"time_period": int(row.time_period or 0),
|
||||||
|
"district": row.district,
|
||||||
|
"measurement_point_id": row.measurement_point_id,
|
||||||
|
"latitude": float(row.latitude),
|
||||||
|
"longitude": float(row.longitude),
|
||||||
|
"avg_congestion_score": float(row.avg_congestion_score),
|
||||||
|
"data_points": row.data_points,
|
||||||
|
"avg_traffic_volume": float(row.avg_traffic_volume or 0),
|
||||||
|
"avg_pedestrian_count": float(row.avg_pedestrian_count or 0)
|
||||||
|
})
|
||||||
|
|
||||||
|
return heatmap_data
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to get congestion heatmap data",
|
||||||
|
city=city, error=str(e))
|
||||||
|
raise DatabaseError(f"Congestion heatmap query failed: {str(e)}")
|
||||||
|
|
||||||
    # ================================================================
    # BULK OPERATIONS AND DATA MANAGEMENT
    # ================================================================

    async def create_bulk_traffic_data(
        self,
        traffic_records: List[Dict[str, Any]],
        city: str,
        tenant_id: Optional[str] = None
    ) -> List[TrafficData]:
        """Create multiple traffic records in bulk with enhanced validation"""
        try:
            # Ensure all records have city and tenant_id
            for record in traffic_records:
                record["city"] = city
                if tenant_id:
                    record["tenant_id"] = tenant_id
                # Ensure dates are timezone-aware
                if "date" in record and record["date"]:
                    record["date"] = self._ensure_utc_datetime(record["date"])

            # Enhanced validation
            validated_records = []
            for record in traffic_records:
                if self._validate_traffic_record(record):
                    validated_records.append(record)
                else:
                    logger.warning("Invalid traffic record skipped",
                                   city=city, record_keys=list(record.keys()))

            if not validated_records:
                logger.warning("No valid traffic records to create", city=city)
                return []

            # Use bulk create with deduplication
            created_records = await self.bulk_create_with_deduplication(validated_records)

            logger.info("Bulk traffic data creation completed",
                        city=city, requested=len(traffic_records),
                        validated=len(validated_records), created=len(created_records))

            return created_records

        except Exception as e:
            logger.error("Failed to create bulk traffic data",
                         city=city, record_count=len(traffic_records), error=str(e))
            raise DatabaseError(f"Bulk traffic creation failed: {str(e)}")
    async def bulk_create_with_deduplication(
        self,
        records: List[Dict[str, Any]]
    ) -> List[TrafficData]:
        """Bulk create with automatic deduplication based on location, city, and date"""
        try:
            if not records:
                return []

            # Extract unique keys for deduplication check
            unique_keys = []
            for record in records:
                key = (
                    record.get('location_id'),
                    record.get('city'),
                    record.get('date'),
                    record.get('measurement_point_id')
                )
                unique_keys.append(key)

            # Check for existing records
            location_ids = [key[0] for key in unique_keys if key[0]]
            cities = [key[1] for key in unique_keys if key[1]]
            dates = [key[2] for key in unique_keys if key[2]]

            # For large datasets, use chunked deduplication to avoid memory issues
            if len(location_ids) > 1000:
                logger.info(f"Large dataset detected ({len(records)} records), using chunked deduplication")
                new_records = []
                chunk_size = 1000

                for i in range(0, len(records), chunk_size):
                    chunk_records = records[i:i + chunk_size]
                    chunk_keys = unique_keys[i:i + chunk_size]

                    # Get unique values for this chunk
                    chunk_location_ids = list(set(key[0] for key in chunk_keys if key[0]))
                    chunk_cities = list(set(key[1] for key in chunk_keys if key[1]))
                    chunk_dates = list(set(key[2] for key in chunk_keys if key[2]))

                    if chunk_location_ids and chunk_cities and chunk_dates:
                        existing_query = select(
                            self.model.location_id,
                            self.model.city,
                            self.model.date,
                            self.model.measurement_point_id
                        ).where(
                            and_(
                                self.model.location_id.in_(chunk_location_ids),
                                self.model.city.in_(chunk_cities),
                                self.model.date.in_(chunk_dates)
                            )
                        )

                        result = await self.session.execute(existing_query)
                        chunk_existing_keys = set(result.fetchall())

                        # Filter chunk duplicates
                        for j, record in enumerate(chunk_records):
                            key = chunk_keys[j]
                            if key not in chunk_existing_keys:
                                new_records.append(record)
                    else:
                        new_records.extend(chunk_records)

                logger.debug("Chunked deduplication completed",
                             total_records=len(records),
                             new_records=len(new_records))
                records = new_records

            elif location_ids and cities and dates:
                existing_query = select(
                    self.model.location_id,
                    self.model.city,
                    self.model.date,
                    self.model.measurement_point_id
                ).where(
                    and_(
                        self.model.location_id.in_(location_ids),
                        self.model.city.in_(cities),
                        self.model.date.in_(dates)
                    )
                )

                result = await self.session.execute(existing_query)
                existing_keys = set(result.fetchall())

                # Filter out duplicates
                new_records = []
                for i, record in enumerate(records):
                    key = unique_keys[i]
                    if key not in existing_keys:
                        new_records.append(record)

                logger.debug("Standard deduplication completed",
                             total_records=len(records),
                             existing_records=len(existing_keys),
                             new_records=len(new_records))

                records = new_records

            # Proceed with bulk creation
            return await self.bulk_create(records)

        except Exception as e:
            logger.error("Failed bulk create with deduplication", error=str(e))
            raise DatabaseError(f"Bulk create with deduplication failed: {str(e)}")
    def _validate_traffic_record(self, record: Dict[str, Any]) -> bool:
        """Enhanced validation for traffic records"""
        required_fields = ['date', 'city']

        # Check required fields
        for field in required_fields:
            if not record.get(field):
                return False

        # Validate city
        city = record.get('city', '').lower()
        if city not in ['madrid', 'barcelona', 'valencia', 'test']:  # Extendable list
            return False

        # Validate data ranges
        traffic_volume = record.get('traffic_volume')
        if traffic_volume is not None and (traffic_volume < 0 or traffic_volume > 50000):
            return False

        pedestrian_count = record.get('pedestrian_count')
        if pedestrian_count is not None and (pedestrian_count < 0 or pedestrian_count > 10000):
            return False

        average_speed = record.get('average_speed')
        if average_speed is not None and (average_speed < 0 or average_speed > 200):
            return False

        congestion_level = record.get('congestion_level')
        if congestion_level and congestion_level not in ['low', 'medium', 'high', 'blocked']:
            return False

        return True

    # ================================================================
    # TRAINING DATA SPECIFIC OPERATIONS
    # ================================================================
    async def get_training_data_by_location(
        self,
        latitude: float,
        longitude: float,
        start_date: datetime,
        end_date: datetime,
        tenant_id: Optional[str] = None,
        include_pedestrian_inference: bool = True
    ) -> List[Dict[str, Any]]:
        """Get optimized training data for ML models"""
        try:
            location_id = f"{latitude:.4f},{longitude:.4f}"

            query = select(self.model).where(
                and_(
                    self.model.location_id == location_id,
                    self.model.date >= self._ensure_utc_datetime(start_date),
                    self.model.date <= self._ensure_utc_datetime(end_date)
                )
            )

            if tenant_id:
                query = query.where(self.model.tenant_id == tenant_id)

            if include_pedestrian_inference:
                # Prefer records with pedestrian inference
                query = query.order_by(
                    desc(self.model.has_pedestrian_inference),
                    desc(self.model.data_quality_score),
                    self.model.date
                )
            else:
                query = query.order_by(
                    desc(self.model.data_quality_score),
                    self.model.date
                )

            result = await self.session.execute(query)
            records = result.scalars().all()

            # Convert to training format with enhanced features
            training_data = []
            for record in records:
                training_record = {
                    'date': record.date,
                    'traffic_volume': record.traffic_volume or 0,
                    'pedestrian_count': record.pedestrian_count or 0,
                    'congestion_level': record.congestion_level or 'medium',
                    'average_speed': record.average_speed or 25.0,
                    'city': record.city,
                    'district': record.district,
                    'measurement_point_id': record.measurement_point_id,
                    'source': record.source,
                    'is_synthetic': record.is_synthetic or False,
                    'has_pedestrian_inference': record.has_pedestrian_inference or False,
                    'data_quality_score': record.data_quality_score or 50.0,

                    # Enhanced features for training
                    'hour_of_day': record.date.hour if record.date else 12,
                    'day_of_week': record.date.weekday() if record.date else 0,
                    'month': record.date.month if record.date else 1,

                    # City-specific features
                    'city_specific_data': record.city_specific_data or {}
                }

                training_data.append(training_record)

            logger.info("Retrieved training data",
                        location_id=location_id, records=len(training_data),
                        with_pedestrian_inference=sum(1 for r in training_data if r['has_pedestrian_inference']))

            return training_data

        except Exception as e:
            logger.error("Failed to get training data",
                         latitude=latitude, longitude=longitude, error=str(e))
            raise DatabaseError(f"Training data retrieval failed: {str(e)}")
    async def get_historical_data_by_location(
        self,
        latitude: float,
        longitude: float,
        start_date: datetime,
        end_date: datetime,
        tenant_id: Optional[str] = None
    ) -> List[TrafficData]:
        """Get historical traffic data for a specific location and date range"""
        return await self.get_by_location_and_date_range(
            latitude=latitude,
            longitude=longitude,
            start_date=start_date,
            end_date=end_date,
            tenant_id=tenant_id,
            limit=1000000  # Large limit for historical data
        )
    async def count_records_in_period(
        self,
        latitude: float,
        longitude: float,
        start_date: datetime,
        end_date: datetime,
        city: Optional[str] = None,
        tenant_id: Optional[str] = None
    ) -> int:
        """Count traffic records for a specific location and time period"""
        try:
            location_id = f"{latitude:.4f},{longitude:.4f}"

            query = select(func.count(self.model.id)).where(
                and_(
                    self.model.location_id == location_id,
                    self.model.date >= self._ensure_utc_datetime(start_date),
                    self.model.date <= self._ensure_utc_datetime(end_date)
                )
            )

            if city:
                query = query.where(self.model.city == city)

            if tenant_id:
                query = query.where(self.model.tenant_id == tenant_id)

            result = await self.session.execute(query)
            count = result.scalar()

            return count or 0

        except Exception as e:
            logger.error("Failed to count records in period",
                         latitude=latitude, longitude=longitude, error=str(e))
            raise DatabaseError(f"Record count failed: {str(e)}")

    # ================================================================
    # DATA QUALITY AND MAINTENANCE
    # ================================================================
    async def update_data_quality_scores(self, city: str) -> int:
        """Update data quality scores based on various criteria"""
        try:
            # Calculate quality scores based on data completeness and consistency
            query = text("""
                UPDATE traffic_data
                SET data_quality_score = (
                    CASE WHEN traffic_volume IS NOT NULL THEN 20 ELSE 0 END +
                    CASE WHEN pedestrian_count IS NOT NULL THEN 20 ELSE 0 END +
                    CASE WHEN average_speed IS NOT NULL AND average_speed > 0 THEN 20 ELSE 0 END +
                    CASE WHEN congestion_level IS NOT NULL THEN 15 ELSE 0 END +
                    CASE WHEN measurement_point_id IS NOT NULL THEN 10 ELSE 0 END +
                    CASE WHEN district IS NOT NULL THEN 10 ELSE 0 END +
                    CASE WHEN has_pedestrian_inference = true THEN 5 ELSE 0 END
                ),
                updated_at = :updated_at
                WHERE city = :city AND data_quality_score IS NULL
            """)

            params = {
                "city": city,
                "updated_at": datetime.now(timezone.utc)
            }

            result = await self.session.execute(query, params)
            updated_count = result.rowcount
            await self.session.commit()

            logger.info("Updated data quality scores",
                        city=city, updated_count=updated_count)

            return updated_count

        except Exception as e:
            logger.error("Failed to update data quality scores",
                         city=city, error=str(e))
            await self.session.rollback()
            raise DatabaseError(f"Data quality update failed: {str(e)}")
    async def cleanup_old_synthetic_data(
        self,
        city: str,
        days_to_keep: int = 90
    ) -> int:
        """Clean up old synthetic data while preserving real data"""
        try:
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)

            query = delete(self.model).where(
                and_(
                    self.model.city == city,
                    self.model.is_synthetic == True,
                    self.model.date < cutoff_date
                )
            )

            result = await self.session.execute(query)
            deleted_count = result.rowcount
            await self.session.commit()

            logger.info("Cleaned up old synthetic data",
                        city=city, deleted_count=deleted_count, days_kept=days_to_keep)

            return deleted_count

        except Exception as e:
            logger.error("Failed to cleanup old synthetic data",
                         city=city, error=str(e))
            await self.session.rollback()
            raise DatabaseError(f"Synthetic data cleanup failed: {str(e)}")
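Reviewer note: a minimal, illustrative sketch of how the bulk path above is expected to be driven. The repository wiring (session/engine setup) and the record values are assumptions, not part of this commit.

# --- Illustrative example only (not part of this commit) ---
# Assumes a TrafficDataRepository instance already constructed with an AsyncSession.
from datetime import datetime, timezone

async def load_sample(repo) -> None:
    records = [{
        "location_id": "40.4168,-3.7038",
        "date": datetime(2025, 6, 1, 8, 0, tzinfo=timezone.utc),
        "traffic_volume": 420,
        "pedestrian_count": 180,
        "congestion_level": "medium",
        "average_speed": 27.5,
        "measurement_point_id": "PM-1001",  # hypothetical point id
    }]
    # Each record is validated, then deduplicated against existing rows before insertion.
    created = await repo.create_bulk_traffic_data(records, city="madrid")
    print(f"created {len(created)} rows")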
class TrafficMeasurementPointRepository(DataBaseRepository[TrafficMeasurementPoint, Dict, Dict]):
    """Repository for traffic measurement points across cities"""

    async def get_points_near_location(
        self,
        latitude: float,
        longitude: float,
        city: str,
        radius_km: float = 10.0,
        limit: int = 20
    ) -> List[TrafficMeasurementPoint]:
        """Get measurement points near a location using spatial query"""
        try:
            # Simple great-circle distance calculation (for more precise results, use PostGIS).
            # The distance is computed in a subquery so the radius filter can reference it;
            # PostgreSQL does not allow a SELECT alias inside HAVING/WHERE of the same level.
            query = text("""
                SELECT * FROM (
                    SELECT *,
                        (6371 * acos(
                            cos(radians(:lat)) * cos(radians(latitude)) *
                            cos(radians(longitude) - radians(:lon)) +
                            sin(radians(:lat)) * sin(radians(latitude))
                        )) as distance_km
                    FROM traffic_measurement_points
                    WHERE city = :city
                      AND is_active = true
                ) AS points_with_distance
                WHERE distance_km <= :radius_km
                ORDER BY distance_km
                LIMIT :limit
            """)

            params = {
                "lat": latitude,
                "lon": longitude,
                "city": city,
                "radius_km": radius_km,
                "limit": limit
            }

            result = await self.session.execute(query, params)
            rows = result.fetchall()

            # Convert rows to model instances
            points = []
            for row in rows:
                point = TrafficMeasurementPoint()
                for key, value in row._mapping.items():
                    if hasattr(point, key) and key != 'distance_km':
                        setattr(point, key, value)
                points.append(point)

            return points

        except Exception as e:
            logger.error("Failed to get measurement points near location",
                         latitude=latitude, longitude=longitude, city=city, error=str(e))
            raise DatabaseError(f"Measurement points query failed: {str(e)}")
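For reference, the SQL above computes a plain great-circle distance with a 6371 km Earth radius. A small Python equivalent of the same expression, useful for sanity-checking the radius filter offline (the coordinates below are just an example):

# --- Illustrative example only (not part of this commit) ---
from math import acos, cos, radians, sin

def distance_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    # Same spherical-law-of-cosines expression as the SQL query above.
    return 6371 * acos(
        cos(radians(lat1)) * cos(radians(lat2)) * cos(radians(lon2) - radians(lon1))
        + sin(radians(lat1)) * sin(radians(lat2))
    )

# Puerta del Sol to Atocha station comes out at roughly 1.5 km.
print(round(distance_km(40.4168, -3.7038, 40.4076, -3.6906), 2))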
class TrafficBackgroundJobRepository(DataBaseRepository[TrafficDataBackgroundJob, Dict, Dict]):
    """Repository for managing background traffic data jobs"""

    async def get_pending_jobs_by_city(self, city: str) -> List[TrafficDataBackgroundJob]:
        """Get pending background jobs for a specific city"""
        try:
            query = select(self.model).where(
                and_(
                    self.model.city == city,
                    self.model.status == 'pending'
                )
            ).order_by(self.model.scheduled_at)

            result = await self.session.execute(query)
            return result.scalars().all()

        except Exception as e:
            logger.error("Failed to get pending jobs by city", city=city, error=str(e))
            raise DatabaseError(f"Background jobs query failed: {str(e)}")

    async def update_job_progress(
        self,
        job_id: str,
        progress_percentage: float,
        records_processed: int = 0,
        records_stored: int = 0
    ) -> bool:
        """Update job progress"""
        try:
            query = update(self.model).where(
                self.model.id == job_id
            ).values(
                progress_percentage=progress_percentage,
                records_processed=records_processed,
                records_stored=records_stored,
                updated_at=datetime.now(timezone.utc)
            )

            result = await self.session.execute(query)
            await self.session.commit()

            return result.rowcount > 0

        except Exception as e:
            logger.error("Failed to update job progress", job_id=job_id, error=str(e))
            await self.session.rollback()
            raise DatabaseError(f"Job progress update failed: {str(e)}")
@@ -3,7 +3,7 @@
 # ================================================================
 """Sales data schemas"""
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, field_validator
 from datetime import datetime
 from typing import Optional, List, Dict, Any
 from uuid import UUID
@@ -20,7 +20,8 @@ class SalesDataCreate(BaseModel):
     source: str = Field(default="manual", max_length=50)
     notes: Optional[str] = Field(None, max_length=500)
 
-    @validator('product_name')
+    @field_validator('product_name')
+    @classmethod
     def normalize_product_name(cls, v):
         return v.strip().lower()
 
@@ -3,7 +3,7 @@
 # ================================================================
 """Traffic data schemas"""
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, field_validator
 from datetime import datetime
 from typing import Optional, List
 from uuid import UUID
@@ -14,7 +14,7 @@ class TrafficDataBase(BaseModel):
     date: datetime = Field(..., description="Date and time of traffic measurement")
     traffic_volume: Optional[int] = Field(None, ge=0, description="Vehicles per hour")
     pedestrian_count: Optional[int] = Field(None, ge=0, description="Pedestrians per hour")
-    congestion_level: Optional[str] = Field(None, regex="^(low|medium|high)$", description="Traffic congestion level")
+    congestion_level: Optional[str] = Field(None, pattern="^(low|medium|high)$", description="Traffic congestion level")
     average_speed: Optional[float] = Field(None, ge=0, le=200, description="Average speed in km/h")
     source: str = Field("madrid_opendata", max_length=50, description="Data source")
     raw_data: Optional[str] = Field(None, description="Raw data from source")
@@ -27,7 +27,7 @@ class TrafficDataUpdate(BaseModel):
     """Schema for updating traffic data"""
     traffic_volume: Optional[int] = Field(None, ge=0)
     pedestrian_count: Optional[int] = Field(None, ge=0)
-    congestion_level: Optional[str] = Field(None, regex="^(low|medium|high)$")
+    congestion_level: Optional[str] = Field(None, pattern="^(low|medium|high)$")
     average_speed: Optional[float] = Field(None, ge=0, le=200)
     raw_data: Optional[str] = None
 
@@ -37,7 +37,8 @@ class TrafficDataResponse(TrafficDataBase):
     created_at: datetime = Field(..., description="Creation timestamp")
     updated_at: datetime = Field(..., description="Last update timestamp")
 
-    @validator('id', pre=True)
+    @field_validator('id', mode='before')
+    @classmethod
     def convert_uuid_to_string(cls, v):
         if isinstance(v, UUID):
             return str(v)
@@ -3,7 +3,7 @@
 # ================================================================
 """Weather data schemas"""
 
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, field_validator
 from datetime import datetime
 from typing import Optional, List
 from uuid import UUID
@@ -41,7 +41,8 @@ class WeatherDataResponse(WeatherDataBase):
     created_at: datetime = Field(..., description="Creation timestamp")
     updated_at: datetime = Field(..., description="Last update timestamp")
 
-    @validator('id', pre=True)
+    @field_validator('id', mode='before')
+    @classmethod
     def convert_uuid_to_string(cls, v):
         if isinstance(v, UUID):
             return str(v)
@@ -76,7 +77,8 @@ class WeatherForecastResponse(WeatherForecastBase):
     created_at: datetime = Field(..., description="Creation timestamp")
     updated_at: datetime = Field(..., description="Last update timestamp")
 
-    @validator('id', pre=True)
+    @field_validator('id', mode='before')
+    @classmethod
     def convert_uuid_to_string(cls, v):
         if isinstance(v, UUID):
             return str(v)
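The three schema hunks above are the Pydantic v2 migration: `validator` becomes `field_validator` declared as a classmethod, `pre=True` becomes `mode='before'`, and `Field(regex=...)` becomes `Field(pattern=...)`. A self-contained sketch of the new style (the model and field names here are illustrative, not taken from this commit):

# --- Illustrative example only (not part of this commit) ---
from uuid import UUID
from pydantic import BaseModel, Field, field_validator

class ExampleResponse(BaseModel):
    id: str
    congestion_level: str = Field("low", pattern="^(low|medium|high)$")

    @field_validator('id', mode='before')
    @classmethod
    def convert_uuid_to_string(cls, v):
        # mode='before' runs on the raw input, like pre=True did in Pydantic v1
        return str(v) if isinstance(v, UUID) else v

print(ExampleResponse(id=UUID(int=1), congestion_level="high").id)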
@@ -1,122 +1,283 @@
 # ================================================================
-# services/data/app/services/traffic_service.py - FIXED VERSION
+# services/data/app/services/traffic_service.py
 # ================================================================
-"""Traffic data service with improved error handling"""
+"""
+Abstracted Traffic Service - Universal interface for traffic data across multiple cities
+"""
 
-from typing import List, Dict, Any, Optional
-from datetime import datetime, timedelta
+import asyncio
+from datetime import datetime
+from typing import Dict, List, Any, Optional, Tuple
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import select, and_
 import structlog
 
+from app.external.apis.traffic import UniversalTrafficClient
 from app.models.traffic import TrafficData
-from app.external.madrid_opendata import MadridOpenDataClient
-from app.schemas.external import TrafficDataResponse
-import uuid
+from app.core.performance import (
+    async_cache,
+    monitor_performance,
+    global_connection_pool,
+    global_performance_monitor,
+    batch_process
+)
 
 logger = structlog.get_logger()
 
 
 class TrafficService:
+    """
+    Abstracted traffic service providing unified interface for traffic data
+    Routes requests to appropriate city-specific clients automatically
+    """
 
     def __init__(self):
-        self.madrid_client = MadridOpenDataClient()
+        self.universal_client = UniversalTrafficClient()
+        self.logger = structlog.get_logger(__name__)
 
-    async def get_current_traffic(self, latitude: float, longitude: float) -> Optional[TrafficDataResponse]:
-        """Get current traffic data for location"""
+    @async_cache(ttl=300)  # Cache for 5 minutes
+    @monitor_performance(monitor=global_performance_monitor)
+    async def get_current_traffic(
+        self,
+        latitude: float,
+        longitude: float,
+        tenant_id: Optional[str] = None
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Get current traffic data for any supported location
+
+        Args:
+            latitude: Query location latitude
+            longitude: Query location longitude
+            tenant_id: Optional tenant identifier for logging/analytics
+
+        Returns:
+            Dict with current traffic data or None if not available
+        """
         try:
-            logger.debug("Getting current traffic", lat=latitude, lon=longitude)
-            traffic_data = await self.madrid_client.get_current_traffic(latitude, longitude)
+            self.logger.info("Getting current traffic data",
+                             lat=latitude, lon=longitude, tenant_id=tenant_id)
+
+            # Delegate to universal client
+            traffic_data = await self.universal_client.get_current_traffic(latitude, longitude)
 
             if traffic_data:
-                logger.debug("Traffic data received", source=traffic_data.get('source'))
-
-                # Validate and clean traffic data before creating response
-                # Use keyword arguments instead of unpacking
-                response = TrafficDataResponse(
-                    date=traffic_data.get("date", datetime.now()),
-                    traffic_volume=int(traffic_data.get("traffic_volume", 100)),
-                    pedestrian_count=int(traffic_data.get("pedestrian_count", 150)),
-                    congestion_level=str(traffic_data.get("congestion_level", "medium")),
-                    average_speed=float(traffic_data.get("average_speed", 25.0)),  # Fixed: use float, not int
-                    source=str(traffic_data.get("source", "unknown"))
-                )
-
-                logger.debug("Successfully created traffic response",
-                             traffic_volume=response.traffic_volume,
-                             congestion_level=response.congestion_level)
-                return response
+                # Add service metadata
+                traffic_data['service_metadata'] = {
+                    'request_timestamp': datetime.now().isoformat(),
+                    'tenant_id': tenant_id,
+                    'service_version': '2.0',
+                    'query_location': {'latitude': latitude, 'longitude': longitude}
+                }
+
+                self.logger.info("Successfully retrieved current traffic data",
+                                 lat=latitude, lon=longitude,
+                                 source=traffic_data.get('source', 'unknown'))
+
+                return traffic_data
             else:
-                logger.warning("No traffic data received from Madrid client")
+                self.logger.warning("No current traffic data available",
+                                    lat=latitude, lon=longitude)
                 return None
 
         except Exception as e:
-            logger.error("Failed to get current traffic", error=str(e), lat=latitude, lon=longitude)
-            # Log the full traceback for debugging
-            import traceback
-            logger.error("Traffic service traceback", traceback=traceback.format_exc())
+            self.logger.error("Error getting current traffic data",
+                              lat=latitude, lon=longitude, error=str(e))
             return None
 
-    async def get_historical_traffic(self,
-                                     latitude: float,
-                                     longitude: float,
-                                     start_date: datetime,
-                                     end_date: datetime,
-                                     db: AsyncSession) -> List[TrafficDataResponse]:
-        """Get historical traffic data with enhanced storage for re-training"""
+    @async_cache(ttl=1800)  # Cache for 30 minutes (historical data changes less frequently)
+    @monitor_performance(monitor=global_performance_monitor)
+    async def get_historical_traffic(
+        self,
+        latitude: float,
+        longitude: float,
+        start_date: datetime,
+        end_date: datetime,
+        tenant_id: Optional[str] = None,
+        db: Optional[AsyncSession] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Get historical traffic data for any supported location with database storage
+
+        Args:
+            latitude: Query location latitude
+            longitude: Query location longitude
+            start_date: Start date for historical data
+            end_date: End date for historical data
+            tenant_id: Optional tenant identifier
+            db: Optional database session for storage
+
+        Returns:
+            List of historical traffic data dictionaries
+        """
         try:
-            logger.debug("Getting historical traffic",
-                         lat=latitude, lon=longitude,
-                         start=start_date, end=end_date)
+            self.logger.info("Getting historical traffic data",
+                             lat=latitude, lon=longitude,
+                             start=start_date, end=end_date, tenant_id=tenant_id)
+
+            # Validate date range
+            if start_date >= end_date:
+                self.logger.warning("Invalid date range", start=start_date, end=end_date)
+                return []
 
-            # Check database first
             location_id = f"{latitude:.4f},{longitude:.4f}"
-            stmt = select(TrafficData).where(
-                and_(
-                    TrafficData.location_id == location_id,
-                    TrafficData.date >= start_date,
-                    TrafficData.date <= end_date
-                )
-            ).order_by(TrafficData.date)
 
-            result = await db.execute(stmt)
-            db_records = result.scalars().all()
+            # Check database first if session provided
+            if db:
+                stmt = select(TrafficData).where(
+                    and_(
+                        TrafficData.location_id == location_id,
+                        TrafficData.date >= start_date,
+                        TrafficData.date <= end_date
+                    )
+                ).order_by(TrafficData.date)
+
+                result = await db.execute(stmt)
+                db_records = result.scalars().all()
 
-            if db_records:
-                logger.debug("Historical traffic data found in database", count=len(db_records))
-                return [TrafficDataResponse(
-                    date=record.date,
-                    traffic_volume=record.traffic_volume,
-                    pedestrian_count=record.pedestrian_count,
-                    congestion_level=record.congestion_level,
-                    average_speed=record.average_speed,
-                    source=record.source
-                ) for record in db_records]
+                if db_records:
+                    self.logger.info("Historical traffic data found in database",
+                                     count=len(db_records))
+                    return [self._convert_db_record_to_dict(record) for record in db_records]
 
-            # If not in database, fetch from API and store
-            logger.debug("Fetching historical data from MADRID OPEN DATA")
-            traffic_data = await self.madrid_client.get_historical_traffic(
+            # Delegate to universal client
+            traffic_data = await self.universal_client.get_historical_traffic(
                 latitude, longitude, start_date, end_date
             )
 
             if traffic_data:
-                # Enhanced storage with better error handling and validation
-                stored_count = await self._store_traffic_data_batch(
-                    traffic_data, location_id, db
-                )
-                logger.info("Traffic data stored for re-training",
-                            fetched=len(traffic_data), stored=stored_count, location=location_id)
-
-                return [TrafficDataResponse(**item) for item in traffic_data]
+                # Add service metadata to each record
+                for record in traffic_data:
+                    record['service_metadata'] = {
+                        'request_timestamp': datetime.now().isoformat(),
+                        'tenant_id': tenant_id,
+                        'service_version': '2.0',
+                        'query_location': {'latitude': latitude, 'longitude': longitude},
+                        'date_range': {
+                            'start': start_date.isoformat(),
+                            'end': end_date.isoformat()
+                        }
+                    }
+
+                # Store in database if session provided
+                if db:
+                    stored_count = await self._store_traffic_data_batch(
+                        traffic_data, location_id, db
+                    )
+                    self.logger.info("Traffic data stored for re-training",
+                                     fetched=len(traffic_data), stored=stored_count,
+                                     location=location_id)
+
+                self.logger.info("Successfully retrieved historical traffic data",
+                                 lat=latitude, lon=longitude, records=len(traffic_data))
+
+                return traffic_data
             else:
-                logger.warning("No historical traffic data received")
+                self.logger.info("No historical traffic data available",
+                                 lat=latitude, lon=longitude)
                 return []
 
         except Exception as e:
-            logger.error("Failed to get historical traffic", error=str(e))
+            self.logger.error("Error getting historical traffic data",
+                              lat=latitude, lon=longitude, error=str(e))
             return []
 
+    def _convert_db_record_to_dict(self, record: TrafficData) -> Dict[str, Any]:
+        """Convert database record to dictionary format"""
+        return {
+            'date': record.date,
+            'traffic_volume': record.traffic_volume,
+            'pedestrian_count': record.pedestrian_count,
+            'congestion_level': record.congestion_level,
+            'average_speed': record.average_speed,
+            'source': record.source,
+            'location_id': record.location_id,
+            'raw_data': record.raw_data
+        }
+
+    async def get_traffic_events(
+        self,
+        latitude: float,
+        longitude: float,
+        radius_km: float = 5.0,
+        tenant_id: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Get traffic events and incidents for any supported location
+
+        Args:
+            latitude: Query location latitude
+            longitude: Query location longitude
+            radius_km: Search radius in kilometers
+            tenant_id: Optional tenant identifier
+
+        Returns:
+            List of traffic events
+        """
+        try:
+            self.logger.info("Getting traffic events",
+                             lat=latitude, lon=longitude, radius=radius_km, tenant_id=tenant_id)
+
+            # Delegate to universal client
+            events = await self.universal_client.get_events(latitude, longitude, radius_km)
+
+            # Add metadata to events
+            for event in events:
+                event['service_metadata'] = {
+                    'request_timestamp': datetime.now().isoformat(),
+                    'tenant_id': tenant_id,
+                    'service_version': '2.0',
+                    'query_location': {'latitude': latitude, 'longitude': longitude},
+                    'search_radius_km': radius_km
+                }
+
+            self.logger.info("Retrieved traffic events",
+                             lat=latitude, lon=longitude, events=len(events))
+
+            return events
+
+        except Exception as e:
+            self.logger.error("Error getting traffic events",
+                              lat=latitude, lon=longitude, error=str(e))
+            return []
+
+    def get_location_info(self, latitude: float, longitude: float) -> Dict[str, Any]:
+        """
+        Get information about traffic data availability for location
+
+        Args:
+            latitude: Query location latitude
+            longitude: Query location longitude
+
+        Returns:
+            Dict with location support information
+        """
+        try:
+            info = self.universal_client.get_location_info(latitude, longitude)
+
+            # Add service layer information
+            info['service_layer'] = {
+                'version': '2.0',
+                'abstraction_level': 'universal',
+                'supported_operations': [
+                    'current_traffic',
+                    'historical_traffic',
+                    'traffic_events',
+                    'bulk_requests'
+                ]
+            }
+
+            return info
+
+        except Exception as e:
+            self.logger.error("Error getting location info",
+                              lat=latitude, lon=longitude, error=str(e))
+            return {
+                'supported': False,
+                'error': str(e),
+                'service_layer': {'version': '2.0'}
+            }
+
     async def store_traffic_data(self,
                                  latitude: float,
                                  longitude: float,
@@ -176,7 +337,8 @@ class TrafficService:
             else:
                 existing_dates = set()
 
-            # Store only new records
+            # Prepare batch of new records for bulk insert
+            batch_records = []
             for data in traffic_data:
                 try:
                     record_date = data.get('date')
@@ -188,32 +350,41 @@ class TrafficService:
                         logger.warning("Invalid traffic data, skipping", data=data)
                         continue
 
-                    traffic_record = TrafficData(
-                        location_id=location_id,
-                        date=record_date,
-                        traffic_volume=data.get('traffic_volume'),
-                        pedestrian_count=data.get('pedestrian_count'),
-                        congestion_level=data.get('congestion_level'),
-                        average_speed=data.get('average_speed'),
-                        source=data.get('source', 'madrid_opendata'),
-                        raw_data=str(data)
-                    )
-
-                    db.add(traffic_record)
-                    stored_count += 1
-
-                    # Commit in batches to avoid memory issues
-                    if stored_count % 100 == 0:
-                        await db.commit()
-                        logger.debug(f"Committed batch of {stored_count} records")
+                    # Prepare record data for bulk insert
+                    record_data = {
+                        'location_id': location_id,
+                        'date': record_date,
+                        'traffic_volume': data.get('traffic_volume'),
+                        'pedestrian_count': data.get('pedestrian_count'),
+                        'congestion_level': data.get('congestion_level'),
+                        'average_speed': data.get('average_speed'),
+                        'source': data.get('source', 'madrid_opendata'),
+                        'raw_data': str(data)
+                    }
+                    batch_records.append(record_data)
 
                 except Exception as record_error:
-                    logger.warning("Failed to store individual traffic record",
+                    logger.warning("Failed to prepare traffic record",
                                    error=str(record_error), data=data)
                     continue
 
-            # Final commit
-            await db.commit()
+            # Use efficient bulk insert instead of individual records
+            if batch_records:
+                # Process in chunks to avoid memory issues
+                chunk_size = 5000
+                for i in range(0, len(batch_records), chunk_size):
+                    chunk = batch_records[i:i + chunk_size]
+
+                    # Use SQLAlchemy bulk insert for maximum performance
+                    await db.execute(
+                        TrafficData.__table__.insert(),
+                        chunk
+                    )
+                    await db.commit()
+                    stored_count += len(chunk)
+
+                    logger.debug(f"Bulk inserted {len(chunk)} records (total: {stored_count})")
+
             logger.info(f"Successfully stored {stored_count} traffic records for location {location_id}")
 
         except Exception as e:
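Reviewer note: the service methods above stack the new performance decorators. A minimal sketch of the intended composition, assuming only that `async_cache` memoizes results by argument with a TTL and that `monitor_performance` records timings; the decorator internals live in services/data/app/core/performance.py and are not reproduced here.

# --- Illustrative example only (not part of this commit) ---
from app.core.performance import async_cache, monitor_performance, global_performance_monitor

class ExampleService:
    @async_cache(ttl=300)  # repeated calls with the same args within 5 minutes return the cached dict
    @monitor_performance(monitor=global_performance_monitor)  # timings recorded for non-cached calls
    async def expensive_lookup(self, latitude: float, longitude: float) -> dict:
        # a real implementation would delegate to an external client here
        return {"latitude": latitude, "longitude": longitude}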
@@ -1,405 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Updated Madrid Historical Traffic test for pytest inside Docker
|
|
||||||
Configured for June 2025 data availability (last available historical data)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import asyncio
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from typing import List, Dict, Any
|
|
||||||
|
|
||||||
# Import from the actual service
|
|
||||||
from app.external.madrid_opendata import MadridOpenDataClient
|
|
||||||
from app.core.config import settings
|
|
||||||
import structlog
|
|
||||||
|
|
||||||
# Configure pytest for async
|
|
||||||
pytestmark = pytest.mark.asyncio
|
|
||||||
|
|
||||||
# Use actual logger
|
|
||||||
logger = structlog.get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
class TestMadridTrafficInside:
|
|
||||||
"""Test class for Madrid traffic functionality inside Docker"""
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def client(self):
|
|
||||||
"""Create Madrid client for testing"""
|
|
||||||
return MadridOpenDataClient()
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def madrid_coords(self):
|
|
||||||
"""Madrid center coordinates"""
|
|
||||||
return 40.4168, -3.7038
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def june_2025_dates(self):
|
|
||||||
"""Date ranges for June 2025 (last available historical data)"""
|
|
||||||
return {
|
|
||||||
"quick": {
|
|
||||||
"start": datetime(2025, 6, 1, 0, 0),
|
|
||||||
"end": datetime(2025, 6, 1, 6, 0) # 6 hours on June 1st
|
|
||||||
},
|
|
||||||
"one_day": {
|
|
||||||
"start": datetime(2025, 6, 15, 0, 0), # Mid-June
|
|
||||||
"end": datetime(2025, 6, 16, 0, 0) # One full day
|
|
||||||
},
|
|
||||||
"three_days": {
|
|
||||||
"start": datetime(2025, 6, 10, 0, 0),
|
|
||||||
"end": datetime(2025, 6, 13, 0, 0) # 3 days in June
|
|
||||||
},
|
|
||||||
"recent_synthetic": {
|
|
||||||
"start": datetime.now() - timedelta(hours=6),
|
|
||||||
"end": datetime.now() # Recent data (will be synthetic)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async def test_quick_historical_traffic_june2025(self, client, madrid_coords, june_2025_dates):
|
|
||||||
"""Test quick historical traffic data from June 2025"""
|
|
||||||
lat, lon = madrid_coords
|
|
||||||
date_range = june_2025_dates["quick"]
|
|
||||||
start_time = date_range["start"]
|
|
||||||
end_time = date_range["end"]
|
|
||||||
|
|
||||||
print(f"\n=== Quick Test (June 2025 - 6 hours) ===")
|
|
||||||
print(f"Location: {lat}, {lon}")
|
|
||||||
print(f"Date range: {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')}")
|
|
||||||
print(f"Note: Testing with June 2025 data (last available historical month)")
|
|
||||||
|
|
||||||
# Test the function
|
|
||||||
execution_start = datetime.now()
|
|
||||||
result = await client.get_historical_traffic(lat, lon, start_time, end_time)
|
|
||||||
execution_time = (datetime.now() - execution_start).total_seconds()
|
|
||||||
|
|
||||||
print(f"⏱️ Execution time: {execution_time:.2f} seconds")
|
|
||||||
print(f"📊 Records returned: {len(result)}")
|
|
||||||
|
|
||||||
# Assertions
|
|
||||||
assert isinstance(result, list), "Result should be a list"
|
|
||||||
assert len(result) > 0, "Should return at least some records"
|
|
||||||
assert execution_time < 5000, "Should execute in reasonable time (allowing for ZIP download)"
|
|
||||||
|
|
||||||
# Check first record structure
|
|
||||||
if result:
|
|
||||||
sample = result[0]
|
|
||||||
print(f"📋 Sample record keys: {list(sample.keys())}")
|
|
||||||
print(f"📡 Data source: {sample.get('source', 'unknown')}")
|
|
||||||
|
|
||||||
# Required fields
|
|
||||||
required_fields = ['date', 'traffic_volume', 'congestion_level', 'average_speed', 'source']
|
|
||||||
for field in required_fields:
|
|
||||||
assert field in sample, f"Missing required field: {field}"
|
|
||||||
|
|
||||||
# Data validation
|
|
||||||
assert isinstance(sample['traffic_volume'], int), "Traffic volume should be int"
|
|
||||||
assert 0 <= sample['traffic_volume'] <= 1000, "Traffic volume should be reasonable"
|
|
||||||
assert sample['congestion_level'] in ['low', 'medium', 'high', 'blocked'], "Invalid congestion level"
|
|
||||||
assert 5 <= sample['average_speed'] <= 100, "Speed should be reasonable"
|
|
||||||
assert isinstance(sample['date'], datetime), "Date should be datetime object"
|
|
||||||
|
|
||||||
# Check if we got real Madrid data or synthetic
|
|
||||||
if sample['source'] == 'madrid_opendata_zip':
|
|
||||||
print(f"🎉 SUCCESS: Got real Madrid historical data from ZIP!")
|
|
||||||
else:
|
|
||||||
print(f"ℹ️ Got synthetic data (real data may not be available)")
|
|
||||||
|
|
||||||
print(f"✅ All validations passed")
|
|
||||||
|
|
||||||
async def test_one_day_june2025(self, client, madrid_coords, june_2025_dates):
|
|
||||||
"""Test one day of June 2025 historical traffic data"""
|
|
||||||
lat, lon = madrid_coords
|
|
||||||
date_range = june_2025_dates["one_day"]
|
|
||||||
start_time = date_range["start"]
|
|
||||||
end_time = date_range["end"]
|
|
||||||
|
|
||||||
print(f"\n=== One Day Test (June 15, 2025) ===")
|
|
||||||
print(f"Date range: {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')}")
|
|
||||||
|
|
||||||
result = await client.get_historical_traffic(lat, lon, start_time, end_time)
|
|
||||||
|
|
||||||
print(f"📊 Records returned: {len(result)}")
|
|
||||||
|
|
||||||
# Should have roughly 24 records (one per hour)
|
|
||||||
assert len(result) >= 20, "Should have at least 20 hourly records for one day"
|
|
||||||
assert len(result) <= 5000, "Should not have more than 30 records for one day"
|
|
||||||
|
|
||||||
# Check data source
|
|
||||||
if result:
|
|
||||||
sources = set(r['source'] for r in result)
|
|
||||||
print(f"📡 Data sources: {', '.join(sources)}")
|
|
||||||
|
|
||||||
# If we got real data, check for realistic measurement point IDs
|
|
||||||
real_data_records = [r for r in result if r['source'] == 'madrid_opendata_zip']
|
|
||||||
if real_data_records:
|
|
||||||
point_ids = set(r['measurement_point_id'] for r in real_data_records)
|
|
||||||
print(f"🏷️ Real measurement points found: {len(point_ids)}")
|
|
||||||
print(f" Sample IDs: {list(point_ids)[:3]}")
|
|
||||||
|
|
||||||
# Check traffic patterns
|
|
||||||
if len(result) >= 24:
|
|
||||||
# Find rush hour records (7-9 AM, 6-8 PM)
|
|
||||||
rush_hour_records = [r for r in result if 7 <= r['date'].hour <= 9 or 18 <= r['date'].hour <= 20]
|
|
||||||
night_records = [r for r in result if r['date'].hour <= 6 or r['date'].hour >= 22]
|
|
||||||
|
|
||||||
if rush_hour_records and night_records:
|
|
||||||
avg_rush_traffic = sum(r['traffic_volume'] for r in rush_hour_records) / len(rush_hour_records)
|
|
||||||
avg_night_traffic = sum(r['traffic_volume'] for r in night_records) / len(night_records)
|
|
||||||
|
|
||||||
print(f"📈 Rush hour avg traffic: {avg_rush_traffic:.1f}")
|
|
||||||
print(f"🌙 Night avg traffic: {avg_night_traffic:.1f}")
|
|
||||||
|
|
||||||
# Rush hour should typically have more traffic than night
|
|
||||||
if avg_rush_traffic > avg_night_traffic:
|
|
||||||
print(f"✅ Traffic patterns look realistic")
|
|
||||||
else:
|
|
||||||
print(f"⚠️ Traffic patterns unusual (not necessarily wrong)")
|
|
||||||
|
|
||||||
async def test_three_days_june2025(self, client, madrid_coords, june_2025_dates):
|
|
||||||
"""Test three days of June 2025 historical traffic data"""
|
|
||||||
lat, lon = madrid_coords
|
|
||||||
date_range = june_2025_dates["three_days"]
|
|
||||||
start_time = date_range["start"]
|
|
||||||
end_time = date_range["end"]
|
|
||||||
|
|
||||||
print(f"\n=== Three Days Test (June 10-13, 2025) ===")
|
|
||||||
print(f"Date range: {start_time.strftime('%Y-%m-%d')} to {end_time.strftime('%Y-%m-%d')}")
|
|
||||||
|
|
||||||
result = await client.get_historical_traffic(lat, lon, start_time, end_time)
|
|
||||||
|
|
||||||
print(f"📊 Records returned: {len(result)}")
|
|
||||||
|
|
||||||
# Should have roughly 72 records (24 hours * 3 days)
|
|
||||||
assert len(result) >= 60, "Should have at least 60 records for 3 days"
|
|
||||||
assert len(result) <= 5000, "Should not have more than 90 records for 3 days"
|
|
||||||
|
|
||||||
# Check data sources
|
|
||||||
sources = set(r['source'] for r in result)
|
|
||||||
print(f"📡 Data sources: {', '.join(sources)}")
|
|
||||||
|
|
||||||
# Calculate statistics
|
|
||||||
traffic_volumes = [r['traffic_volume'] for r in result]
|
|
||||||
speeds = [r['average_speed'] for r in result]
|
|
||||||
|
|
||||||
avg_traffic = sum(traffic_volumes) / len(traffic_volumes)
|
|
||||||
max_traffic = max(traffic_volumes)
|
|
||||||
min_traffic = min(traffic_volumes)
|
|
||||||
avg_speed = sum(speeds) / len(speeds)
|
|
||||||
|
|
||||||
print(f"📈 Statistics:")
|
|
||||||
print(f" Average traffic: {avg_traffic:.1f}")
|
|
||||||
print(f" Max traffic: {max_traffic}")
|
|
||||||
print(f" Min traffic: {min_traffic}")
|
|
||||||
print(f" Average speed: {avg_speed:.1f} km/h")
|
|
||||||
|
|
||||||
# Analyze by data source
|
|
||||||
real_data_records = [r for r in result if r['source'] == 'madrid_opendata_zip']
|
|
||||||
synthetic_records = [r for r in result if r['source'] != 'madrid_opendata_zip']
|
|
||||||
|
|
||||||
print(f"🔍 Data breakdown:")
|
|
||||||
print(f" Real Madrid data: {len(real_data_records)} records")
|
|
||||||
print(f" Synthetic data: {len(synthetic_records)} records")
|
|
||||||
|
|
||||||
if real_data_records:
|
|
||||||
# Show measurement points from real data
|
|
||||||
real_points = set(r['measurement_point_id'] for r in real_data_records)
|
|
||||||
print(f" Real measurement points: {len(real_points)}")
|
|
||||||
|
|
||||||
# Sanity checks
|
|
||||||
assert 10 <= avg_traffic <= 500, "Average traffic should be reasonable"
|
|
||||||
assert 10 <= avg_speed <= 60, "Average speed should be reasonable"
|
|
||||||
assert max_traffic >= avg_traffic, "Max should be >= average"
|
|
||||||
assert min_traffic <= avg_traffic, "Min should be <= average"
|
|
||||||
|
|
||||||
async def test_recent_vs_historical_data(self, client, madrid_coords, june_2025_dates):
|
|
||||||
"""Compare recent data (synthetic) vs June 2025 data (potentially real)"""
|
|
||||||
lat, lon = madrid_coords
|
|
||||||
|
|
||||||
print(f"\n=== Recent vs Historical Data Comparison ===")
|
|
||||||
|
|
||||||
# Test recent data (should be synthetic)
|
|
||||||
recent_range = june_2025_dates["recent_synthetic"]
|
|
||||||
recent_result = await client.get_historical_traffic(
|
|
||||||
lat, lon, recent_range["start"], recent_range["end"]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test June 2025 data (potentially real)
|
|
||||||
june_range = june_2025_dates["quick"]
|
|
||||||
june_result = await client.get_historical_traffic(
|
|
||||||
lat, lon, june_range["start"], june_range["end"]
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f"📊 Recent data: {len(recent_result)} records")
|
|
||||||
print(f"📊 June 2025 data: {len(june_result)} records")
|
|
||||||
|
|
||||||
if recent_result:
|
|
||||||
recent_sources = set(r['source'] for r in recent_result)
|
|
||||||
print(f"📡 Recent sources: {', '.join(recent_sources)}")
|
|
||||||
|
|
||||||
if june_result:
|
|
||||||
june_sources = set(r['source'] for r in june_result)
|
|
||||||
print(f"📡 June sources: {', '.join(june_sources)}")
|
|
||||||
|
|
||||||
# Check if we successfully got real data from June
|
|
||||||
if 'madrid_opendata_zip' in june_sources:
|
|
||||||
print(f"🎉 SUCCESS: Real Madrid data successfully fetched from June 2025!")
|
|
||||||
|
|
||||||
# Show details of real data
|
|
||||||
real_records = [r for r in june_result if r['source'] == 'madrid_opendata_zip']
|
|
||||||
if real_records:
|
|
||||||
sample = real_records[0]
|
|
||||||
print(f"📋 Real data sample:")
|
|
||||||
print(f" Date: {sample['date']}")
|
|
||||||
print(f" Traffic volume: {sample['traffic_volume']}")
|
|
||||||
print(f" Measurement point: {sample['measurement_point_id']}")
|
|
||||||
print(f" Point name: {sample.get('measurement_point_name', 'N/A')}")
|
|
||||||
else:
|
|
||||||
print(f"ℹ️ June data is synthetic (real ZIP may not be accessible)")
|
|
||||||
|
|
||||||
    async def test_madrid_zip_month_code(self, client):
        """Test the month code calculation for Madrid ZIP files"""
        print(f"\n=== Madrid ZIP Month Code Test ===")

        # Test the month code calculation function
        # (an illustrative sketch of this formula is appended at the end of this file)
        test_cases = [
            (2025, 6, 145),  # Known: June 2025 = 145
            (2025, 5, 144),  # Known: May 2025 = 144
            (2025, 4, 143),  # Known: April 2025 = 143
            (2025, 7, 146),  # Predicted: July 2025 = 146
        ]

        for year, month, expected_code in test_cases:
            if hasattr(client, '_calculate_madrid_month_code'):
                calculated_code = client._calculate_madrid_month_code(year, month)
                status = "✅" if calculated_code == expected_code else "⚠️"
                print(f"{status} {year}-{month:02d}: Expected {expected_code}, Got {calculated_code}")

                # Generate ZIP URL
                if calculated_code:
                    zip_url = f"https://datos.madrid.es/egob/catalogo/208627-{calculated_code}-transporte-ptomedida-historico.zip"
                    print(f"   ZIP URL: {zip_url}")
            else:
                print(f"⚠️ Month code calculation function not available")

    async def test_edge_case_large_date_range(self, client, madrid_coords):
        """Test edge case: date range too large"""
        lat, lon = madrid_coords
        start_time = datetime(2025, 1, 1)  # 6+ months range
        end_time = datetime(2025, 7, 1)

        print(f"\n=== Edge Case: Large Date Range ===")
        print(f"Testing 6-month range: {start_time.date()} to {end_time.date()}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records for 6-month range: {len(result)}")

        # Should return empty list for ranges > 90 days
        assert len(result) == 0, "Should return empty list for date ranges > 90 days"
        print(f"✅ Correctly handled large date range")

    async def test_edge_case_invalid_coordinates(self, client):
        """Test edge case: invalid coordinates"""
        print(f"\n=== Edge Case: Invalid Coordinates ===")

        start_time = datetime(2025, 6, 1)
        end_time = datetime(2025, 6, 1, 6, 0)

        # Test with invalid coordinates
        result = await client.get_historical_traffic(999.0, 999.0, start_time, end_time)

        print(f"📊 Records for invalid coords: {len(result)}")

        # Should either return empty list or synthetic data
        # The function should not crash
        assert isinstance(result, list), "Should return list even with invalid coords"
        print(f"✅ Handled invalid coordinates gracefully")

    async def test_real_madrid_zip_access(self, client):
        """Test if we can access the actual Madrid ZIP files"""
        print(f"\n=== Real Madrid ZIP Access Test ===")

        # Test the known ZIP URLs
        test_urls = [
            "https://datos.madrid.es/egob/catalogo/208627-145-transporte-ptomedida-historico.zip",  # June 2025
            "https://datos.madrid.es/egob/catalogo/208627-144-transporte-ptomedida-historico.zip",  # May 2025
            "https://datos.madrid.es/egob/catalogo/208627-143-transporte-ptomedida-historico.zip",  # April 2025
        ]

        for i, url in enumerate(test_urls):
            month_name = ["June 2025", "May 2025", "April 2025"][i]
            print(f"\nTesting {month_name}: {url}")

            try:
                if hasattr(client, '_fetch_historical_zip'):
                    zip_data = await client._fetch_historical_zip(url)
                    if zip_data:
                        print(f"✅ Successfully fetched ZIP: {len(zip_data)} bytes")

                        # Try to inspect ZIP contents
                        try:
                            import zipfile
                            from io import BytesIO

                            with zipfile.ZipFile(BytesIO(zip_data), 'r') as zip_file:
                                files = zip_file.namelist()
                                csv_files = [f for f in files if f.endswith('.csv')]
                                print(f"📁 ZIP contains {len(files)} files, {len(csv_files)} CSV files")

                                if csv_files:
                                    print(f"   CSV files: {csv_files[:2]}{'...' if len(csv_files) > 2 else ''}")

                        except Exception as e:
                            print(f"⚠️ Could not inspect ZIP contents: {e}")
                    else:
                        print(f"❌ Failed to fetch ZIP")
                else:
                    print(f"⚠️ ZIP fetch function not available")

            except Exception as e:
                print(f"❌ Error testing ZIP access: {e}")

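# ----------------------------------------------------------------------
# Editor's illustrative sketch - not part of the committed client code.
# The `_fetch_historical_zip` helper exercised above is not shown in this
# commit, so the snippet below only demonstrates one plausible shape for
# such a download helper, assuming aiohttp is available; the function
# name, timeout and error handling are assumptions, not the project's
# actual implementation.
# ----------------------------------------------------------------------
from typing import Optional

import aiohttp


async def fetch_zip_bytes(url: str, timeout_seconds: int = 60) -> Optional[bytes]:
    """Download a ZIP file and return its raw bytes, or None on failure."""
    timeout = aiohttp.ClientTimeout(total=timeout_seconds)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.read()
                return None
    except aiohttp.ClientError:
        return None
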
# Additional standalone test functions for manual running
async def run_manual_test():
    """Manual test function that can be run directly"""
    print("="*60)
    print("MADRID TRAFFIC TEST - JUNE 2025 DATA")
    print("="*60)

    client = MadridOpenDataClient()
    madrid_lat, madrid_lon = 40.4168, -3.7038

    # Test with June 2025 data (last available)
    start_time = datetime(2025, 6, 15, 14, 0)  # June 15, 2025 at 2 PM
    end_time = datetime(2025, 6, 15, 18, 0)    # Until 6 PM (4 hours)

    print(f"\nTesting June 15, 2025 data (2 PM - 6 PM)...")
    print(f"This should include afternoon traffic patterns")

    result = await client.get_historical_traffic(madrid_lat, madrid_lon, start_time, end_time)

    print(f"Result: {len(result)} records")

    if result:
        sources = set(r['source'] for r in result)
        print(f"Data sources: {', '.join(sources)}")

        if 'madrid_opendata_zip' in sources:
            print(f"🎉 Successfully got real Madrid data!")

        sample = result[0]
        print(f"\nSample record:")
        for key, value in sample.items():
            if key == "date":
                print(f"   {key}: {value.strftime('%Y-%m-%d %H:%M:%S')}")
            else:
                print(f"   {key}: {value}")

    print(f"\n✅ Manual test completed!")


if __name__ == "__main__":
    # If run directly, execute manual test
    asyncio.run(run_manual_test())
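
# ----------------------------------------------------------------------
# Editor's illustrative sketch - not part of the committed test file.
# The expected values in test_madrid_zip_month_code imply a linear month
# index anchored at a known reference (June 2025 -> 145). A minimal helper
# consistent with those cases could look like the one below; the real
# `_calculate_madrid_month_code` implementation is not shown in this commit.
# ----------------------------------------------------------------------
def calculate_madrid_month_code(year: int, month: int) -> int:
    """Return the datos.madrid.es catalogue code implied by the test cases above."""
    # Months elapsed relative to the June 2025 reference, offset by its code 145.
    return 145 + (year - 2025) * 12 + (month - 6)


def build_madrid_zip_url(year: int, month: int) -> str:
    """Build the historical traffic ZIP URL for a given month."""
    code = calculate_madrid_month_code(year, month)
    return f"https://datos.madrid.es/egob/catalogo/208627-{code}-transporte-ptomedida-historico.zip"


# Quick self-check against the cases listed in the test above:
assert calculate_madrid_month_code(2025, 6) == 145
assert calculate_madrid_month_code(2025, 5) == 144
assert calculate_madrid_month_code(2025, 7) == 146
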
@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
 from typing import Dict, List, Optional, Any, Tuple
 from dataclasses import dataclass
 import asyncio
-import logging
+import structlog
 from concurrent.futures import ThreadPoolExecutor
 from datetime import timezone
 import pandas as pd
@@ -24,7 +24,7 @@ from app.services.messaging import (
     publish_job_failed
 )
 
-logger = logging.getLogger(__name__)
+logger = structlog.get_logger()
 
 @dataclass
 class TrainingDataSet:
@@ -39,15 +39,14 @@ class TrainingDataOrchestrator:
     """
     Enhanced orchestrator for data collection from multiple sources.
     Ensures date alignment, handles data source constraints, and prepares data for ML training.
+    Uses the new abstracted traffic service layer for multi-city support.
     """
 
     def __init__(self,
-                 madrid_client=None,
-                 weather_client=None,
                  date_alignment_service: DateAlignmentService = None):
         self.data_client = DataClient()
         self.date_alignment_service = date_alignment_service or DateAlignmentService()
-        self.max_concurrent_requests = 3
+        self.max_concurrent_requests = 5  # Increased for better performance
 
     async def prepare_training_data(
         self,
@@ -281,11 +280,11 @@ class TrainingDataOrchestrator:
             )
             tasks.append(("weather", weather_task))
 
-        # Traffic data collection
+        # Enhanced Traffic data collection (supports multiple cities)
         if DataSourceType.MADRID_TRAFFIC in aligned_range.available_sources:
-            logger.info(f"🚛 Traffic data source available, creating collection task for date range: {aligned_range.start} to {aligned_range.end}")
+            logger.info(f"🚛 Traffic data source available for multiple cities, creating collection task for date range: {aligned_range.start} to {aligned_range.end}")
             traffic_task = asyncio.create_task(
-                self._collect_traffic_data_with_timeout(lat, lon, aligned_range, tenant_id)
+                self._collect_traffic_data_with_timeout_enhanced(lat, lon, aligned_range, tenant_id)
             )
             tasks.append(("traffic", traffic_task))
         else:
@@ -353,28 +352,31 @@ class TrainingDataOrchestrator:
             logger.warning(f"Weather data collection failed: {e}, using synthetic data")
             return self._generate_synthetic_weather_data(aligned_range)
 
-    async def _collect_traffic_data_with_timeout(
+    async def _collect_traffic_data_with_timeout_enhanced(
         self,
         lat: float,
         lon: float,
         aligned_range: AlignedDateRange,
         tenant_id: str
     ) -> List[Dict[str, Any]]:
-        """Collect traffic data with enhanced storage and retrieval for re-training"""
+        """
+        Enhanced traffic data collection with multi-city support and improved storage
+        Uses the new abstracted traffic service layer
+        """
         try:
-            # Double-check Madrid constraint before making request
+            # Double-check constraints before making request
             constraint_violated = self.date_alignment_service.check_madrid_current_month_constraint(aligned_range.end)
             if constraint_violated:
-                logger.warning(f"🚫 Madrid current month constraint violation: end_date={aligned_range.end}, no traffic data available")
+                logger.warning(f"🚫 Current month constraint violation: end_date={aligned_range.end}, no traffic data available")
                 return []
             else:
-                logger.info(f"✅ Madrid constraint passed: end_date={aligned_range.end}, proceeding with traffic data request")
+                logger.info(f"✅ Date constraints passed: end_date={aligned_range.end}, proceeding with traffic data request")
 
             start_date_str = aligned_range.start.isoformat()
             end_date_str = aligned_range.end.isoformat()
 
-            # Fetch traffic data - this will automatically store it for future re-training
+            # Enhanced: Fetch traffic data using new abstracted service
+            # This automatically detects the appropriate city and uses the right client
             traffic_data = await self.data_client.fetch_traffic_data(
                 tenant_id=tenant_id,
                 start_date=start_date_str,
@@ -382,39 +384,82 @@ class TrainingDataOrchestrator:
                 latitude=lat,
                 longitude=lon)
 
-            # Validate traffic data
-            if self._validate_traffic_data(traffic_data):
-                logger.info(f"Collected and stored {len(traffic_data)} valid traffic records for re-training")
+            # Enhanced validation including pedestrian inference data
+            if self._validate_traffic_data_enhanced(traffic_data):
+                logger.info(f"Collected and stored {len(traffic_data)} valid enhanced traffic records for re-training")
 
-                # Log storage success for audit purposes
-                self._log_traffic_data_storage(lat, lon, aligned_range, len(traffic_data))
+                # Log storage success with enhanced metadata
+                self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, len(traffic_data), traffic_data)
 
                 return traffic_data
             else:
-                logger.warning("Invalid traffic data received")
+                logger.warning("Invalid enhanced traffic data received")
                 return []
 
         except asyncio.TimeoutError:
-            logger.warning(f"Traffic data collection timed out")
+            logger.warning(f"Enhanced traffic data collection timed out")
             return []
         except Exception as e:
-            logger.warning(f"Traffic data collection failed: {e}")
+            logger.warning(f"Enhanced traffic data collection failed: {e}")
             return []
 
+    # Keep original method for backwards compatibility
+    async def _collect_traffic_data_with_timeout(
+        self,
+        lat: float,
+        lon: float,
+        aligned_range: AlignedDateRange,
+        tenant_id: str
+    ) -> List[Dict[str, Any]]:
+        """Legacy traffic data collection method - redirects to enhanced version"""
+        return await self._collect_traffic_data_with_timeout_enhanced(lat, lon, aligned_range, tenant_id)
+
+    def _log_enhanced_traffic_data_storage(self,
+                                           lat: float,
+                                           lon: float,
+                                           aligned_range: AlignedDateRange,
+                                           record_count: int,
+                                           traffic_data: List[Dict[str, Any]]):
+        """Enhanced logging for traffic data storage with detailed metadata"""
+        # Analyze the stored data for additional insights
+        cities_detected = set()
+        has_pedestrian_data = 0
+        data_sources = set()
+        districts_covered = set()
+
+        for record in traffic_data:
+            if 'city' in record and record['city']:
+                cities_detected.add(record['city'])
+            if 'pedestrian_count' in record and record['pedestrian_count'] is not None:
+                has_pedestrian_data += 1
+            if 'source' in record and record['source']:
+                data_sources.add(record['source'])
+            if 'district' in record and record['district']:
+                districts_covered.add(record['district'])
+
+        logger.info(
+            "Enhanced traffic data stored for re-training",
+            location=f"{lat:.4f},{lon:.4f}",
+            date_range=f"{aligned_range.start.isoformat()} to {aligned_range.end.isoformat()}",
+            records_stored=record_count,
+            cities_detected=list(cities_detected),
+            pedestrian_inference_coverage=f"{has_pedestrian_data}/{record_count}",
+            data_sources=list(data_sources),
+            districts_covered=list(districts_covered),
+            storage_timestamp=datetime.now().isoformat(),
+            purpose="enhanced_model_training_and_retraining",
+            architecture_version="2.0_abstracted"
+        )
+
     def _log_traffic_data_storage(self,
                                   lat: float,
                                   lon: float,
                                   aligned_range: AlignedDateRange,
                                   record_count: int):
-        """Log traffic data storage for audit and re-training tracking"""
-        logger.info(
-            "Traffic data stored for re-training",
-            location=f"{lat:.4f},{lon:.4f}",
-            date_range=f"{aligned_range.start.isoformat()} to {aligned_range.end.isoformat()}",
-            records_stored=record_count,
-            storage_timestamp=datetime.now().isoformat(),
-            purpose="model_training_and_retraining"
-        )
+        """Legacy logging method - redirects to enhanced version"""
+        # Create minimal traffic data structure for enhanced logging
+        minimal_traffic_data = [{"city": "madrid", "source": "legacy"}] * min(record_count, 1)
+        self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, record_count, minimal_traffic_data)
 
     async def retrieve_stored_traffic_for_retraining(
         self,
@@ -491,32 +536,73 @@ class TrainingDataOrchestrator:
 
         return is_valid
 
-    def _validate_traffic_data(self, traffic_data: List[Dict[str, Any]]) -> bool:
-        """Validate traffic data quality"""
+    def _validate_traffic_data_enhanced(self, traffic_data: List[Dict[str, Any]]) -> bool:
+        """Enhanced validation for traffic data including pedestrian inference and city-specific fields"""
         if not traffic_data:
             return False
 
         required_fields = ['date']
         traffic_fields = ['traffic_volume', 'traffic_intensity', 'intensidad', 'trafico']
+        enhanced_fields = ['pedestrian_count', 'congestion_level', 'source']
+        city_specific_fields = ['city', 'measurement_point_id', 'district']
 
         valid_records = 0
-        for record in traffic_data:
-            # Check required fields
-            if not all(field in record for field in required_fields):
-                continue
-
-            # Check at least one traffic field exists
-            if any(field in record and record[field] is not None for field in traffic_fields):
-                valid_records += 1
-
-        # Consider valid if at least 30% of records are valid (traffic data is often sparse)
-        validity_threshold = 0.3
-        is_valid = (valid_records / len(traffic_data)) >= validity_threshold
-
-        if not is_valid:
-            logger.warning(f"Traffic data validation failed: {valid_records}/{len(traffic_data)} valid records")
-
-        return is_valid
+        enhanced_records = 0
+        city_aware_records = 0
+
+        for record in traffic_data:
+            record_score = 0
+
+            # Check required fields
+            if all(field in record and record[field] is not None for field in required_fields):
+                record_score += 1
+
+            # Check traffic data fields
+            if any(field in record and record[field] is not None for field in traffic_fields):
+                record_score += 1
+
+            # Check enhanced fields (pedestrian inference, etc.)
+            enhanced_count = sum(1 for field in enhanced_fields
+                                 if field in record and record[field] is not None)
+            if enhanced_count >= 2:  # At least 2 enhanced fields
+                enhanced_records += 1
+                record_score += 1
+
+            # Check city-specific awareness
+            city_count = sum(1 for field in city_specific_fields
+                             if field in record and record[field] is not None)
+            if city_count >= 1:  # At least some city awareness
+                city_aware_records += 1
+
+            # Record is valid if it has basic requirements
+            if record_score >= 2:
+                valid_records += 1
+
+        total_records = len(traffic_data)
+        validity_threshold = 0.3
+        enhancement_threshold = 0.2  # Lower threshold for enhanced features
+
+        basic_validity = (valid_records / total_records) >= validity_threshold
+        has_enhancements = (enhanced_records / total_records) >= enhancement_threshold
+        has_city_awareness = (city_aware_records / total_records) >= enhancement_threshold
+
+        logger.info("Enhanced traffic data validation results",
+                    total_records=total_records,
+                    valid_records=valid_records,
+                    enhanced_records=enhanced_records,
+                    city_aware_records=city_aware_records,
+                    basic_validity=basic_validity,
+                    has_enhancements=has_enhancements,
+                    has_city_awareness=has_city_awareness)
+
+        if not basic_validity:
+            logger.warning(f"Traffic data basic validation failed: {valid_records}/{total_records} valid records")
+
+        return basic_validity
+
+    def _validate_traffic_data(self, traffic_data: List[Dict[str, Any]]) -> bool:
+        """Legacy validation method - redirects to enhanced version"""
+        return self._validate_traffic_data_enhanced(traffic_data)
 
     def _validate_data_sources(
         self,
@@ -300,8 +300,14 @@ class BaseRepository(Generic[Model, CreateSchema, UpdateSchema], ABC):
         self.session.add_all(db_objects)
         await self.session.flush()
 
-        for db_obj in db_objects:
-            await self.session.refresh(db_obj)
+        # Skip expensive individual refresh operations for large datasets
+        # Only refresh if we have a small number of objects
+        if len(db_objects) <= 100:
+            for db_obj in db_objects:
+                await self.session.refresh(db_obj)
+        else:
+            # For large datasets, just log without refresh to prevent memory issues
+            logger.debug(f"Skipped individual refresh for large bulk operation ({len(db_objects)} records)")
 
         logger.debug(f"Bulk created {len(db_objects)} {self.model.__name__} records")
         return db_objects
@@ -777,7 +777,7 @@ log_step "5.1. Testing basic dashboard functionality"
 # forecast request with proper schema
 FORECAST_REQUEST="{
     \"product_name\": \"pan\",
-    \"forecast_date\": \"2025-08-08\",
+    \"forecast_date\": \"2025-08-10\",
     \"forecast_days\": 1,
     \"location\": \"madrid_centro\",
     \"confidence_level\": 0.85