REFACTOR data service

Urtzi Alfaro
2025-08-12 18:17:30 +02:00
parent 7c237c0acc
commit fbe7470ad9
149 changed files with 8528 additions and 7393 deletions

@@ -0,0 +1,191 @@
# ================================================================
# services/data/app/repositories/traffic_repository.py
# ================================================================
"""
Traffic Repository - enhanced for multiple cities with comprehensive data access patterns.
Follows the existing repository architecture while adding city-specific functionality.
"""
from typing import Optional, List, Dict, Any, Type, Tuple
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, or_, func, desc, asc, text, update, delete
from sqlalchemy.orm import selectinload
from datetime import datetime, timezone, timedelta
import structlog

from app.models.traffic import TrafficData
from app.schemas.traffic import TrafficDataCreate, TrafficDataResponse
from shared.database.exceptions import DatabaseError, ValidationError

logger = structlog.get_logger()

class TrafficRepository:
    """
    Enhanced repository for traffic data operations across multiple cities.
    Provides city-aware queries and advanced traffic analytics.
    """

    def __init__(self, session: AsyncSession):
        self.session = session
        self.model = TrafficData

    # ================================================================
    # CORE TRAFFIC DATA OPERATIONS
    # ================================================================
    async def get_by_location_and_date_range(
        self,
        latitude: float,
        longitude: float,
        start_date: datetime,
        end_date: datetime,
        tenant_id: Optional[str] = None
    ) -> List[TrafficData]:
        """Get traffic data by location and date range."""
        try:
            location_id = f"{latitude:.4f},{longitude:.4f}"

            # Build base query
            query = select(self.model).where(self.model.location_id == location_id)

            # Add tenant filter if specified
            if tenant_id:
                query = query.where(self.model.tenant_id == tenant_id)

            # Add date range filters
            if start_date:
                query = query.where(self.model.date >= start_date)
            if end_date:
                query = query.where(self.model.date <= end_date)

            # Order by date
            query = query.order_by(self.model.date)

            result = await self.session.execute(query)
            return list(result.scalars().all())
        except Exception as e:
            logger.error("Failed to get traffic data by location and date range",
                         latitude=latitude, longitude=longitude,
                         error=str(e))
            raise DatabaseError(f"Failed to get traffic data: {str(e)}")
    async def store_traffic_data_batch(
        self,
        traffic_data_list: List[Dict[str, Any]],
        location_id: str,
        tenant_id: Optional[str] = None
    ) -> int:
        """Store a batch of traffic data records with enhanced validation and duplicate handling."""
        stored_count = 0
        try:
            if not traffic_data_list:
                return 0

            # Check for existing records to avoid duplicates
            dates = [data.get('date') for data in traffic_data_list if data.get('date')]
            existing_dates = set()
            if dates:
                existing_stmt = select(TrafficData.date).where(
                    and_(
                        TrafficData.location_id == location_id,
                        TrafficData.date.in_(dates)
                    )
                )
                result = await self.session.execute(existing_stmt)
                existing_dates = {row[0] for row in result.fetchall()}
                logger.debug(f"Found {len(existing_dates)} existing records for location {location_id}")

            batch_records = []
            for data in traffic_data_list:
                record_date = data.get('date')
                if not record_date or record_date in existing_dates:
                    continue  # Skip duplicates

                # Validate data before preparing it for insertion
                if self._validate_traffic_data(data):
                    batch_records.append({
                        'location_id': location_id,
                        'city': data.get('city', 'madrid'),  # Default to madrid for historical data
                        'tenant_id': tenant_id,  # Include tenant_id in batch insert
                        'date': record_date,
                        'traffic_volume': data.get('traffic_volume'),
                        'pedestrian_count': data.get('pedestrian_count'),
                        'congestion_level': data.get('congestion_level'),
                        'average_speed': data.get('average_speed'),
                        'source': data.get('source', 'unknown'),
                        'raw_data': str(data)
                    })

            if batch_records:
                # Use a bulk insert for performance
                await self.session.execute(
                    TrafficData.__table__.insert(),
                    batch_records
                )
                await self.session.commit()
                stored_count = len(batch_records)
                logger.info(f"Successfully stored {stored_count} traffic records for location {location_id}")
        except Exception as e:
            logger.error("Failed to store traffic data batch",
                         error=str(e), location_id=location_id)
            await self.session.rollback()
            raise DatabaseError(f"Batch store failed: {str(e)}")
        return stored_count
    def _validate_traffic_data(self, data: Dict[str, Any]) -> bool:
        """Validate traffic data before storage."""
        required_fields = ['date']

        # Check required fields
        for field in required_fields:
            if not data.get(field):
                return False

        # Validate data types and ranges
        traffic_volume = data.get('traffic_volume')
        if traffic_volume is not None and (traffic_volume < 0 or traffic_volume > 10000):
            return False

        pedestrian_count = data.get('pedestrian_count')
        if pedestrian_count is not None and (pedestrian_count < 0 or pedestrian_count > 10000):
            return False

        average_speed = data.get('average_speed')
        if average_speed is not None and (average_speed < 0 or average_speed > 200):
            return False

        congestion_level = data.get('congestion_level')
        if congestion_level and congestion_level not in ['low', 'medium', 'high', 'blocked']:
            return False

        return True
    async def get_historical_traffic_for_training(self,
                                                  latitude: float,
                                                  longitude: float,
                                                  start_date: datetime,
                                                  end_date: datetime) -> List[TrafficData]:
        """Retrieve stored traffic data for training ML models."""
        # Compute the location key before the try block so it is always
        # bound when the error handler below logs it.
        location_id = f"{latitude:.4f},{longitude:.4f}"
        try:
            stmt = select(TrafficData).where(
                and_(
                    TrafficData.location_id == location_id,
                    TrafficData.date >= start_date,
                    TrafficData.date <= end_date
                )
            ).order_by(TrafficData.date)
            result = await self.session.execute(stmt)
            return list(result.scalars().all())
        except Exception as e:
            logger.error("Failed to retrieve traffic data for training",
                         error=str(e), location_id=location_id)
            raise DatabaseError(f"Training data retrieval failed: {str(e)}")

@@ -0,0 +1,138 @@
# services/external/app/repositories/weather_repository.py
from typing import List, Dict, Any, Optional
from datetime import datetime
from sqlalchemy import select, and_
from sqlalchemy.ext.asyncio import AsyncSession
import structlog
import json

from app.models.weather import WeatherData

logger = structlog.get_logger()


class WeatherRepository:
    """
    Repository for weather data operations, adapted for WeatherService.
    """

    def __init__(self, session: AsyncSession):
        self.session = session
    async def get_historical_weather(self,
                                     location_id: str,
                                     start_date: datetime,
                                     end_date: datetime) -> List[WeatherData]:
        """
        Retrieves historical weather data for a specific location and date range.
        This method directly supports the data retrieval logic in WeatherService.
        """
        try:
            stmt = select(WeatherData).where(
                and_(
                    WeatherData.location_id == location_id,
                    WeatherData.date >= start_date,
                    WeatherData.date <= end_date
                )
            ).order_by(WeatherData.date)
            result = await self.session.execute(stmt)
            records = result.scalars().all()
            logger.debug(f"Retrieved {len(records)} historical records for location {location_id}")
            return list(records)
        except Exception as e:
            logger.error(
                "Failed to get historical weather from repository",
                error=str(e),
                location_id=location_id
            )
            raise
    def _serialize_json_fields(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Serialize JSON fields (raw_data, processed_data) to ensure proper JSON storage.
        Note: only top-level datetime-like values are converted; nested
        structures are passed through unchanged.
        """
        serialized = data.copy()
        for field in ('raw_data', 'processed_data'):
            value = serialized.get(field)
            if value is None or isinstance(value, str):
                continue
            try:
                if isinstance(value, dict):
                    # Convert datetime-like values to ISO strings so the dict
                    # can be stored as JSON
                    serialized[field] = {
                        k: v.isoformat() if hasattr(v, 'isoformat') else v
                        for k, v in value.items()
                    }
            except Exception as e:
                logger.warning(f"Could not serialize {field}, storing as string: {e}")
                serialized[field] = str(value)
        return serialized
    async def bulk_create_weather_data(self, weather_records: List[Dict[str, Any]]) -> None:
        """
        Bulk inserts new weather records into the database.
        Used by WeatherService after fetching new historical data from an external API.
        """
        try:
            if not weather_records:
                return

            # Serialize JSON fields before creating model instances
            serialized_records = [self._serialize_json_fields(data) for data in weather_records]
            records = [WeatherData(**data) for data in serialized_records]

            self.session.add_all(records)
            await self.session.commit()
            logger.info(f"Successfully bulk inserted {len(records)} weather records")
        except Exception as e:
            await self.session.rollback()
            logger.error(
                "Failed to bulk create weather records",
                error=str(e),
                count=len(weather_records)
            )
            raise
    async def create_weather_data(self, data: Dict[str, Any]) -> WeatherData:
        """
        Creates a single new weather data record.
        """
        try:
            # Serialize JSON fields before creating the model instance
            serialized_data = self._serialize_json_fields(data)
            new_record = WeatherData(**serialized_data)
            self.session.add(new_record)
            await self.session.commit()
            await self.session.refresh(new_record)
            logger.info(f"Created new weather record with ID {new_record.id}")
            return new_record
        except Exception as e:
            await self.session.rollback()
            logger.error("Failed to create single weather record", error=str(e))
            raise
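
As with the traffic repository, a minimal usage sketch; the session factory and any WeatherData fields beyond those referenced in this file are illustrative assumptions.

    # Hypothetical caller (not in this commit); assumes the WeatherData model
    # accepts location_id, date, and raw_data keyword arguments.
    from datetime import datetime, timezone

    async def backfill_weather(async_session_factory) -> None:
        async with async_session_factory() as session:
            repo = WeatherRepository(session)
            # Top-level datetimes in raw_data are ISO-formatted by
            # _serialize_json_fields before insertion
            await repo.bulk_create_weather_data([{
                'location_id': "40.4168,-3.7038",
                'date': datetime(2025, 8, 1, tzinfo=timezone.utc),
                'raw_data': {'temp_c': 31.5, 'fetched_at': datetime.now(timezone.utc)},
            }])
            history = await repo.get_historical_weather(
                location_id="40.4168,-3.7038",
                start_date=datetime(2025, 8, 1, tzinfo=timezone.utc),
                end_date=datetime(2025, 8, 31, tzinfo=timezone.utc),
            )
            logger.info("weather backfill complete", rows=len(history))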