Improve the traffic fetching system

Urtzi Alfaro
2025-08-08 23:29:48 +02:00
parent 8af17f1433
commit 312fdc8ef3
8 changed files with 680 additions and 51 deletions

View File

@@ -110,4 +110,60 @@ async def get_historical_traffic(
         raise
     except Exception as e:
         logger.error("Unexpected error in historical traffic API", error=str(e))
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+
+@router.post("/tenants/{tenant_id}/traffic/stored")
+async def get_stored_traffic_for_training(
+    request: HistoricalTrafficRequest,
+    db: AsyncSession = Depends(get_db),
+    tenant_id: UUID = Path(..., description="Tenant ID"),
+    current_user: Dict[str, Any] = Depends(get_current_user_dep),
+):
+    """Get stored traffic data specifically for training/re-training purposes"""
+    try:
+        # Validate date range
+        if request.end_date <= request.start_date:
+            raise HTTPException(status_code=400, detail="End date must be after start date")
+
+        # Allow longer date ranges for training (up to 3 years)
+        if (request.end_date - request.start_date).days > 1095:
+            raise HTTPException(status_code=400, detail="Date range cannot exceed 3 years for training data")
+
+        logger.info("Retrieving stored traffic data for training",
+                    tenant_id=str(tenant_id),
+                    location=f"{request.latitude},{request.longitude}",
+                    date_range=f"{request.start_date} to {request.end_date}")
+
+        # Use the dedicated method for training data retrieval
+        stored_data = await traffic_service.get_stored_traffic_for_training(
+            request.latitude, request.longitude, request.start_date, request.end_date, db
+        )
+
+        # Log retrieval for audit purposes
+        logger.info("Stored traffic data retrieved for training",
+                    records_count=len(stored_data),
+                    tenant_id=str(tenant_id),
+                    purpose="model_training")
+
+        # Publish event for monitoring
+        try:
+            await publish_traffic_updated({
+                "type": "stored_data_retrieved_for_training",
+                "latitude": request.latitude,
+                "longitude": request.longitude,
+                "start_date": request.start_date.isoformat(),
+                "end_date": request.end_date.isoformat(),
+                "records_count": len(stored_data),
+                "tenant_id": str(tenant_id),
+                "timestamp": datetime.utcnow().isoformat()
+            })
+        except Exception as pub_error:
+            logger.warning("Failed to publish stored traffic retrieval event", error=str(pub_error))
+
+        return stored_data
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("Unexpected error in stored traffic retrieval API", error=str(e))
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
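
A minimal client-side sketch of exercising the new endpoint. The host, port, bearer-token header, and the exact HistoricalTrafficRequest field names are assumptions for illustration; only the route path and the date-range rules come from this diff.

    # Hypothetical call to the new training-data endpoint; host, port and
    # auth scheme are assumptions, not part of this commit.
    import httpx

    tenant_id = "00000000-0000-0000-0000-000000000000"  # placeholder tenant
    payload = {
        "latitude": 40.4168,   # central Madrid, sample coordinates
        "longitude": -3.7038,
        "start_date": "2022-01-01T00:00:00",
        "end_date": "2024-12-31T00:00:00",  # within the 3-year (1095-day) cap
    }
    resp = httpx.post(
        f"http://localhost:8000/tenants/{tenant_id}/traffic/stored",
        json=payload,
        headers={"Authorization": "Bearer <token>"},  # assumed auth header
    )
    resp.raise_for_status()
    print(len(resp.json()), "stored records returned")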

View File

@@ -63,7 +63,7 @@ class TrafficService:
                                            start_date: datetime,
                                            end_date: datetime,
                                            db: AsyncSession) -> List[TrafficDataResponse]:
-        """Get historical traffic data"""
+        """Get historical traffic data with enhanced storage for re-training"""
         try:
             logger.debug("Getting historical traffic",
                          lat=latitude, lon=longitude,
@@ -100,27 +100,12 @@ class TrafficService:
             )
 
             if traffic_data:
-                # Store in database for future use
-                try:
-                    for data in traffic_data:
-                        traffic_record = TrafficData(
-                            location_id=location_id,
-                            date=data.get('date', datetime.now()),
-                            traffic_volume=data.get('traffic_volume'),
-                            pedestrian_count=data.get('pedestrian_count'),
-                            congestion_level=data.get('congestion_level'),
-                            average_speed=data.get('average_speed'),
-                            source="madrid_opendata",
-                            raw_data=str(data),
-                            created_at=datetime.now()
-                        )
-                        db.add(traffic_record)
-                    await db.commit()
-                    logger.debug("Historical data stored in database", count=len(traffic_data))
-                except Exception as db_error:
-                    logger.warning("Failed to store historical data in database", error=str(db_error))
-                    await db.rollback()
+                # Enhanced storage with better error handling and validation
+                stored_count = await self._store_traffic_data_batch(
+                    traffic_data, location_id, db
+                )
+                logger.info("Traffic data stored for re-training",
+                            fetched=len(traffic_data), stored=stored_count, location=location_id)
 
                 return [TrafficDataResponse(**item) for item in traffic_data]
@@ -137,7 +122,7 @@ class TrafficService:
                                 longitude: float,
                                 traffic_data: Dict[str, Any],
                                 db: AsyncSession) -> bool:
-        """Store traffic data to database"""
+        """Store single traffic data record to database"""
         try:
             location_id = f"{latitude:.4f},{longitude:.4f}"
@@ -161,4 +146,152 @@ class TrafficService:
         except Exception as e:
             logger.error("Failed to store traffic data", error=str(e))
             await db.rollback()
-            return False
+            return False
+
+    async def _store_traffic_data_batch(self,
+                                        traffic_data: List[Dict[str, Any]],
+                                        location_id: str,
+                                        db: AsyncSession) -> int:
+        """Store batch of traffic data with enhanced validation and duplicate handling"""
+        stored_count = 0
+        try:
+            # Check for existing records to avoid duplicates
+            if traffic_data:
+                dates = [data.get('date') for data in traffic_data if data.get('date')]
+                if dates:
+                    # Query existing records for this location and date range
+                    existing_stmt = select(TrafficData.date).where(
+                        and_(
+                            TrafficData.location_id == location_id,
+                            TrafficData.date.in_(dates)
+                        )
+                    )
+                    result = await db.execute(existing_stmt)
+                    existing_dates = {row[0] for row in result.fetchall()}
+                    logger.debug(f"Found {len(existing_dates)} existing records for location {location_id}")
+                else:
+                    existing_dates = set()
+            else:
+                existing_dates = set()
+
+            # Store only new records
+            for data in traffic_data:
+                try:
+                    record_date = data.get('date')
+                    if not record_date or record_date in existing_dates:
+                        continue  # Skip duplicates
+
+                    # Validate required fields
+                    if not self._validate_traffic_data(data):
+                        logger.warning("Invalid traffic data, skipping", data=data)
+                        continue
+
+                    traffic_record = TrafficData(
+                        location_id=location_id,
+                        date=record_date,
+                        traffic_volume=data.get('traffic_volume'),
+                        pedestrian_count=data.get('pedestrian_count'),
+                        congestion_level=data.get('congestion_level'),
+                        average_speed=data.get('average_speed'),
+                        source=data.get('source', 'madrid_opendata'),
+                        raw_data=str(data)
+                    )
+                    db.add(traffic_record)
+                    stored_count += 1
+                    # Track the date so repeated entries within this batch are also skipped
+                    existing_dates.add(record_date)
+
+                    # Commit in batches to avoid memory issues
+                    if stored_count % 100 == 0:
+                        await db.commit()
+                        logger.debug(f"Committed batch of {stored_count} records")
+
+                except Exception as record_error:
+                    logger.warning("Failed to store individual traffic record",
+                                   error=str(record_error), data=data)
+                    continue
+
+            # Final commit
+            await db.commit()
+            logger.info(f"Successfully stored {stored_count} traffic records for location {location_id}")
+
+        except Exception as e:
+            logger.error("Failed to store traffic data batch",
+                         error=str(e), location_id=location_id)
+            await db.rollback()
+
+        return stored_count
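
The select-then-insert deduplication above costs an extra round trip and can still race with a concurrent writer between the SELECT and the COMMIT. A sketch of an alternative that pushes duplicate handling into PostgreSQL with an upsert; it assumes a unique constraint on (location_id, date), which the migration below does not yet define, and that `select`/`and_` are imported from sqlalchemy in this module:

    # Sketch only: single-statement insert that skips duplicates in-database.
    # Requires a UNIQUE constraint on (location_id, date) to take effect.
    from sqlalchemy.dialects.postgresql import insert as pg_insert

    async def store_batch_upsert(traffic_data, location_id, db):
        values = [
            {
                "location_id": location_id,
                "date": d["date"],
                "traffic_volume": d.get("traffic_volume"),
                "pedestrian_count": d.get("pedestrian_count"),
                "congestion_level": d.get("congestion_level"),
                "average_speed": d.get("average_speed"),
                "source": d.get("source", "madrid_opendata"),
                "raw_data": str(d),
            }
            for d in traffic_data if d.get("date")
        ]
        if not values:
            return 0
        stmt = pg_insert(TrafficData).values(values).on_conflict_do_nothing(
            index_elements=["location_id", "date"]
        )
        result = await db.execute(stmt)
        await db.commit()
        return result.rowcount  # rows actually inserted; duplicates are skipped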
+
+    def _validate_traffic_data(self, data: Dict[str, Any]) -> bool:
+        """Validate traffic data before storage"""
+        required_fields = ['date']
+
+        # Check required fields
+        for field in required_fields:
+            if not data.get(field):
+                return False
+
+        # Validate data types and ranges
+        traffic_volume = data.get('traffic_volume')
+        if traffic_volume is not None and (traffic_volume < 0 or traffic_volume > 10000):
+            return False
+
+        pedestrian_count = data.get('pedestrian_count')
+        if pedestrian_count is not None and (pedestrian_count < 0 or pedestrian_count > 10000):
+            return False
+
+        average_speed = data.get('average_speed')
+        if average_speed is not None and (average_speed < 0 or average_speed > 200):
+            return False
+
+        congestion_level = data.get('congestion_level')
+        if congestion_level and congestion_level not in ['low', 'medium', 'high', 'blocked']:
+            return False
+
+        return True
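
A few illustrative checks of the validator's contract (made-up values; assumes a TrafficService instance can be constructed here):

    from datetime import datetime

    svc = TrafficService()
    assert svc._validate_traffic_data(
        {"date": datetime(2024, 5, 1), "traffic_volume": 850, "congestion_level": "medium"}
    )
    assert not svc._validate_traffic_data({"traffic_volume": 850})  # missing required 'date'
    assert not svc._validate_traffic_data(
        {"date": datetime(2024, 5, 1), "average_speed": 320}  # above the 200 cap
    )
    assert not svc._validate_traffic_data(
        {"date": datetime(2024, 5, 1), "congestion_level": "gridlock"}  # unknown level
    )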
+
+    async def get_stored_traffic_for_training(self,
+                                              latitude: float,
+                                              longitude: float,
+                                              start_date: datetime,
+                                              end_date: datetime,
+                                              db: AsyncSession) -> List[Dict[str, Any]]:
+        """Retrieve stored traffic data specifically for training purposes"""
+        # Build the location key before the try block so the error log below can use it
+        location_id = f"{latitude:.4f},{longitude:.4f}"
+        try:
+            stmt = select(TrafficData).where(
+                and_(
+                    TrafficData.location_id == location_id,
+                    TrafficData.date >= start_date,
+                    TrafficData.date <= end_date
+                )
+            ).order_by(TrafficData.date)
+
+            result = await db.execute(stmt)
+            records = result.scalars().all()
+
+            # Convert to training format
+            training_data = []
+            for record in records:
+                training_data.append({
+                    'date': record.date,
+                    'traffic_volume': record.traffic_volume,
+                    'pedestrian_count': record.pedestrian_count,
+                    'congestion_level': record.congestion_level,
+                    'average_speed': record.average_speed,
+                    'location_id': record.location_id,
+                    'source': record.source,
+                    'raw_data': record.raw_data  # Stringified source payload with extra metadata
+                })
+
+            logger.info(f"Retrieved {len(training_data)} traffic records for training",
+                        location_id=location_id, start=start_date, end=end_date)
+            return training_data
+
+        except Exception as e:
+            logger.error("Failed to retrieve traffic data for training",
+                         error=str(e), location_id=location_id)
+            return []
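
For context, a sketch of how a training pipeline might consume this payload; pandas and the resampling choice are assumptions of the example, not dependencies introduced by the commit:

    # Inside an async context with `db`, `start_date` and `end_date` in scope.
    import pandas as pd

    rows = await traffic_service.get_stored_traffic_for_training(
        40.4168, -3.7038, start_date, end_date, db
    )
    df = pd.DataFrame(rows)
    if not df.empty:
        df["date"] = pd.to_datetime(df["date"])
        df = df.set_index("date").sort_index()
        # e.g. hourly averages as model features
        hourly = df[["traffic_volume", "pedestrian_count"]].resample("1h").mean()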

View File

@@ -0,0 +1,54 @@
"""Create traffic_data table for storing traffic data for re-training
Revision ID: 001_traffic_data
Revises:
Create Date: 2025-01-08 12:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import UUID
# revision identifiers, used by Alembic.
revision = '001_traffic_data'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
"""Create traffic_data table"""
op.create_table('traffic_data',
sa.Column('id', UUID(as_uuid=True), nullable=False, primary_key=True),
sa.Column('location_id', sa.String(100), nullable=False, index=True),
sa.Column('date', sa.DateTime(timezone=True), nullable=False, index=True),
sa.Column('traffic_volume', sa.Integer, nullable=True),
sa.Column('pedestrian_count', sa.Integer, nullable=True),
sa.Column('congestion_level', sa.String(20), nullable=True),
sa.Column('average_speed', sa.Float, nullable=True),
sa.Column('source', sa.String(50), nullable=False, server_default='madrid_opendata'),
sa.Column('raw_data', sa.Text, nullable=True),
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
)
# Create index for efficient querying by location and date
op.create_index(
'idx_traffic_location_date',
'traffic_data',
['location_id', 'date']
)
# Create index for date range queries
op.create_index(
'idx_traffic_date_range',
'traffic_data',
['date']
)
def downgrade():
"""Drop traffic_data table"""
op.drop_index('idx_traffic_date_range', table_name='traffic_data')
op.drop_index('idx_traffic_location_date', table_name='traffic_data')
op.drop_table('traffic_data')
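
The diff does not include the TrafficData ORM model the service relies on; a declarative sketch consistent with this migration might look like the following (names inferred from usage, so treat it as an assumption). The migration itself is applied with `alembic upgrade head`.

    import uuid
    from datetime import datetime, timezone

    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
    from sqlalchemy.dialects.postgresql import UUID
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class TrafficData(Base):
        """Inferred model matching the traffic_data table above."""
        __tablename__ = "traffic_data"

        id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
        location_id = Column(String(100), nullable=False)
        date = Column(DateTime(timezone=True), nullable=False)
        traffic_volume = Column(Integer)
        pedestrian_count = Column(Integer)
        congestion_level = Column(String(20))
        average_speed = Column(Float)
        source = Column(String(50), nullable=False, default="madrid_opendata")
        raw_data = Column(Text)
        created_at = Column(DateTime(timezone=True), nullable=False,
                            default=lambda: datetime.now(timezone.utc))
        updated_at = Column(DateTime(timezone=True), nullable=False,
                            default=lambda: datetime.now(timezone.utc),
                            onupdate=lambda: datetime.now(timezone.utc))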