Improve the traffic fetching system
@@ -110,4 +110,60 @@ async def get_historical_traffic(
         raise
     except Exception as e:
         logger.error("Unexpected error in historical traffic API", error=str(e))
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+@router.post("/tenants/{tenant_id}/traffic/stored")
+async def get_stored_traffic_for_training(
+    request: HistoricalTrafficRequest,
+    db: AsyncSession = Depends(get_db),
+    tenant_id: UUID = Path(..., description="Tenant ID"),
+    current_user: Dict[str, Any] = Depends(get_current_user_dep),
+):
+    """Get stored traffic data specifically for training/re-training purposes"""
+    try:
+        # Validate date range
+        if request.end_date <= request.start_date:
+            raise HTTPException(status_code=400, detail="End date must be after start date")
+
+        # Allow longer date ranges for training (up to 3 years)
+        if (request.end_date - request.start_date).days > 1095:
+            raise HTTPException(status_code=400, detail="Date range cannot exceed 3 years for training data")
+
+        logger.info("Retrieving stored traffic data for training",
+                    tenant_id=str(tenant_id),
+                    location=f"{request.latitude},{request.longitude}",
+                    date_range=f"{request.start_date} to {request.end_date}")
+
+        # Use the dedicated method for training data retrieval
+        stored_data = await traffic_service.get_stored_traffic_for_training(
+            request.latitude, request.longitude, request.start_date, request.end_date, db
+        )
+
+        # Log retrieval for audit purposes
+        logger.info("Stored traffic data retrieved for training",
+                    records_count=len(stored_data),
+                    tenant_id=str(tenant_id),
+                    purpose="model_training")
+
+        # Publish event for monitoring
+        try:
+            await publish_traffic_updated({
+                "type": "stored_data_retrieved_for_training",
+                "latitude": request.latitude,
+                "longitude": request.longitude,
+                "start_date": request.start_date.isoformat(),
+                "end_date": request.end_date.isoformat(),
+                "records_count": len(stored_data),
+                "tenant_id": str(tenant_id),
+                "timestamp": datetime.utcnow().isoformat()
+            })
+        except Exception as pub_error:
+            logger.warning("Failed to publish stored traffic retrieval event", error=str(pub_error))
+
+        return stored_data
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("Unexpected error in stored traffic retrieval API", error=str(e))
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
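
For reference, a minimal client-side sketch of calling the new endpoint. The payload field names come from the handler above (`request.latitude`, `request.longitude`, `request.start_date`, `request.end_date`); the base URL and bearer-token auth scheme are assumptions, not confirmed by this diff:

# Hypothetical client for POST /tenants/{tenant_id}/traffic/stored.
# Base URL and auth scheme are assumed; adjust to the deployment.
import asyncio
import httpx

async def fetch_stored_traffic(tenant_id: str, token: str) -> list:
    payload = {
        "latitude": 40.4168,
        "longitude": -3.7038,
        "start_date": "2024-01-01T00:00:00",
        "end_date": "2024-06-30T23:59:59",
    }
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.post(
            f"/tenants/{tenant_id}/traffic/stored",
            json=payload,
            headers={"Authorization": f"Bearer {token}"},
        )
        resp.raise_for_status()
        return resp.json()

# asyncio.run(fetch_stored_traffic("<tenant-uuid>", "<token>"))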
@@ -63,7 +63,7 @@ class TrafficService:
                                       start_date: datetime,
                                       end_date: datetime,
                                       db: AsyncSession) -> List[TrafficDataResponse]:
-        """Get historical traffic data"""
+        """Get historical traffic data with enhanced storage for re-training"""
         try:
             logger.debug("Getting historical traffic",
                          lat=latitude, lon=longitude,
@@ -100,27 +100,12 @@ class TrafficService:
             )
 
             if traffic_data:
-                # Store in database for future use
-                try:
-                    for data in traffic_data:
-                        traffic_record = TrafficData(
-                            location_id=location_id,
-                            date=data.get('date', datetime.now()),
-                            traffic_volume=data.get('traffic_volume'),
-                            pedestrian_count=data.get('pedestrian_count'),
-                            congestion_level=data.get('congestion_level'),
-                            average_speed=data.get('average_speed'),
-                            source="madrid_opendata",
-                            raw_data=str(data),
-                            created_at=datetime.now()
-                        )
-                        db.add(traffic_record)
-
-                    await db.commit()
-                    logger.debug("Historical data stored in database", count=len(traffic_data))
-                except Exception as db_error:
-                    logger.warning("Failed to store historical data in database", error=str(db_error))
-                    await db.rollback()
+                # Enhanced storage with better error handling and validation
+                stored_count = await self._store_traffic_data_batch(
+                    traffic_data, location_id, db
+                )
+                logger.info("Traffic data stored for re-training",
+                            fetched=len(traffic_data), stored=stored_count, location=location_id)
 
             return [TrafficDataResponse(**item) for item in traffic_data]
@@ -137,7 +122,7 @@ class TrafficService:
                                   longitude: float,
                                   traffic_data: Dict[str, Any],
                                   db: AsyncSession) -> bool:
-        """Store traffic data to database"""
+        """Store single traffic data record to database"""
         try:
             location_id = f"{latitude:.4f},{longitude:.4f}"
 
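
Worth noting: the `location_id` key rounds coordinates to four decimal places (about 11 m of latitude), so nearby fixes for the same venue collapse into one storage bucket. A quick illustration:

# Two GPS fixes a few metres apart share the same storage key.
lat, lon = 40.41683, -3.70381
location_id = f"{lat:.4f},{lon:.4f}"
print(location_id)  # -> 40.4168,-3.7038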
@@ -161,4 +146,152 @@ class TrafficService:
         except Exception as e:
             logger.error("Failed to store traffic data", error=str(e))
             await db.rollback()
             return False
+
+    async def _store_traffic_data_batch(self,
+                                        traffic_data: List[Dict[str, Any]],
+                                        location_id: str,
+                                        db: AsyncSession) -> int:
+        """Store batch of traffic data with enhanced validation and duplicate handling"""
+        stored_count = 0
+
+        try:
+            # Check for existing records to avoid duplicates
+            if traffic_data:
+                dates = [data.get('date') for data in traffic_data if data.get('date')]
+                if dates:
+                    # Query existing records for this location and date range
+                    existing_stmt = select(TrafficData.date).where(
+                        and_(
+                            TrafficData.location_id == location_id,
+                            TrafficData.date.in_(dates)
+                        )
+                    )
+                    result = await db.execute(existing_stmt)
+                    existing_dates = {row[0] for row in result.fetchall()}
+
+                    logger.debug(f"Found {len(existing_dates)} existing records for location {location_id}")
+                else:
+                    existing_dates = set()
+            else:
+                existing_dates = set()
+
+            # Store only new records
+            for data in traffic_data:
+                try:
+                    record_date = data.get('date')
+                    if not record_date or record_date in existing_dates:
+                        continue  # Skip duplicates
+
+                    # Validate required fields
+                    if not self._validate_traffic_data(data):
+                        logger.warning("Invalid traffic data, skipping", data=data)
+                        continue
+
+                    traffic_record = TrafficData(
+                        location_id=location_id,
+                        date=record_date,
+                        traffic_volume=data.get('traffic_volume'),
+                        pedestrian_count=data.get('pedestrian_count'),
+                        congestion_level=data.get('congestion_level'),
+                        average_speed=data.get('average_speed'),
+                        source=data.get('source', 'madrid_opendata'),
+                        raw_data=str(data)
+                    )
+
+                    db.add(traffic_record)
+                    stored_count += 1
+
+                    # Commit in batches to avoid memory issues
+                    if stored_count % 100 == 0:
+                        await db.commit()
+                        logger.debug(f"Committed batch of {stored_count} records")
+
+                except Exception as record_error:
+                    logger.warning("Failed to store individual traffic record",
+                                   error=str(record_error), data=data)
+                    continue
+
+            # Final commit
+            await db.commit()
+            logger.info(f"Successfully stored {stored_count} traffic records for location {location_id}")
+
+        except Exception as e:
+            logger.error("Failed to store traffic data batch",
+                         error=str(e), location_id=location_id)
+            await db.rollback()
+
+        return stored_count
+
+    def _validate_traffic_data(self, data: Dict[str, Any]) -> bool:
+        """Validate traffic data before storage"""
+        required_fields = ['date']
+
+        # Check required fields
+        for field in required_fields:
+            if not data.get(field):
+                return False
+
+        # Validate data types and ranges
+        traffic_volume = data.get('traffic_volume')
+        if traffic_volume is not None and (traffic_volume < 0 or traffic_volume > 10000):
+            return False
+
+        pedestrian_count = data.get('pedestrian_count')
+        if pedestrian_count is not None and (pedestrian_count < 0 or pedestrian_count > 10000):
+            return False
+
+        average_speed = data.get('average_speed')
+        if average_speed is not None and (average_speed < 0 or average_speed > 200):
+            return False
+
+        congestion_level = data.get('congestion_level')
+        if congestion_level and congestion_level not in ['low', 'medium', 'high', 'blocked']:
+            return False
+
+        return True
+
+    async def get_stored_traffic_for_training(self,
+                                              latitude: float,
+                                              longitude: float,
+                                              start_date: datetime,
+                                              end_date: datetime,
+                                              db: AsyncSession) -> List[Dict[str, Any]]:
+        """Retrieve stored traffic data specifically for training purposes"""
+        # Computed before the try block so the except handler can reference it safely
+        location_id = f"{latitude:.4f},{longitude:.4f}"
+        try:
+            stmt = select(TrafficData).where(
+                and_(
+                    TrafficData.location_id == location_id,
+                    TrafficData.date >= start_date,
+                    TrafficData.date <= end_date
+                )
+            ).order_by(TrafficData.date)
+
+            result = await db.execute(stmt)
+            records = result.scalars().all()
+
+            # Convert to training format
+            training_data = []
+            for record in records:
+                training_data.append({
+                    'date': record.date,
+                    'traffic_volume': record.traffic_volume,
+                    'pedestrian_count': record.pedestrian_count,
+                    'congestion_level': record.congestion_level,
+                    'average_speed': record.average_speed,
+                    'location_id': record.location_id,
+                    'source': record.source,
+                    'measurement_point_id': record.raw_data  # Contains additional metadata
+                })
+
+            logger.info(f"Retrieved {len(training_data)} traffic records for training",
+                        location_id=location_id, start=start_date, end=end_date)
+
+            return training_data
+
+        except Exception as e:
+            logger.error("Failed to retrieve traffic data for training",
+                         error=str(e), location_id=location_id)
+            return []
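
A minimal sketch of exercising the new retrieval path end to end. `async_session` is an assumed async SQLAlchemy session factory; `traffic_service` is the instance the API layer above already uses:

# Hypothetical wiring; `async_session` is an assumed sessionmaker.
import asyncio
from datetime import datetime

async def load_training_window():
    async with async_session() as db:
        rows = await traffic_service.get_stored_traffic_for_training(
            40.4168, -3.7038,
            datetime(2024, 1, 1), datetime(2024, 6, 30),
            db,
        )
        print(f"{len(rows)} stored records available for training")

# asyncio.run(load_training_window())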
@@ -0,0 +1,54 @@
+"""Create traffic_data table for storing traffic data for re-training
+
+Revision ID: 001_traffic_data
+Revises:
+Create Date: 2025-01-08 12:00:00.000000
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import UUID
+
+# revision identifiers, used by Alembic.
+revision = '001_traffic_data'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create traffic_data table"""
+    op.create_table('traffic_data',
+        sa.Column('id', UUID(as_uuid=True), nullable=False, primary_key=True),
+        sa.Column('location_id', sa.String(100), nullable=False, index=True),
+        sa.Column('date', sa.DateTime(timezone=True), nullable=False, index=True),
+        sa.Column('traffic_volume', sa.Integer, nullable=True),
+        sa.Column('pedestrian_count', sa.Integer, nullable=True),
+        sa.Column('congestion_level', sa.String(20), nullable=True),
+        sa.Column('average_speed', sa.Float, nullable=True),
+        sa.Column('source', sa.String(50), nullable=False, server_default='madrid_opendata'),
+        sa.Column('raw_data', sa.Text, nullable=True),
+        sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
+    )
+
+    # Create index for efficient querying by location and date
+    op.create_index(
+        'idx_traffic_location_date',
+        'traffic_data',
+        ['location_id', 'date']
+    )
+
+    # Create index for date range queries
+    op.create_index(
+        'idx_traffic_date_range',
+        'traffic_data',
+        ['date']
+    )
+
+
+def downgrade():
+    """Drop traffic_data table"""
+    op.drop_index('idx_traffic_date_range', table_name='traffic_data')
+    op.drop_index('idx_traffic_location_date', table_name='traffic_data')
+    op.drop_table('traffic_data')
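
Note that `created_at` and `updated_at` are NOT NULL with no server default, so the ORM model is presumably expected to populate them on insert. The migration itself is applied with the usual `alembic upgrade head`, or programmatically (assuming an `alembic.ini` in the working directory):

# Apply the migration without shelling out; assumes alembic.ini is present.
from alembic import command
from alembic.config import Config

alembic_cfg = Config("alembic.ini")
command.upgrade(alembic_cfg, "head")  # runs upgrade() in 001_traffic_data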
@@ -24,6 +24,13 @@ class DataClient:
         # Get the shared data client configured for this service
         self.data_client = get_data_client(settings, "training")
 
+        # Check if the new method is available for stored traffic data
+        if hasattr(self.data_client, 'get_stored_traffic_data_for_training'):
+            self.supports_stored_traffic_data = True
+        else:
+            self.supports_stored_traffic_data = False
+            logger.warning("Stored traffic data method not available in data client")
+
         # Or alternatively, get all clients at once:
         # self.clients = get_service_clients(settings, "training")
         # Then use: self.clients.data.get_sales_data(...)
@@ -147,6 +154,51 @@ class DataClient:
             logger.error(f"Error fetching traffic data: {e}", tenant_id=tenant_id)
             return []
 
+    async def fetch_stored_traffic_data_for_training(
+        self,
+        tenant_id: str,
+        start_date: str,
+        end_date: str,
+        latitude: Optional[float] = None,
+        longitude: Optional[float] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Fetch stored traffic data specifically for training/re-training.
+        This method accesses previously stored traffic data without making new API calls.
+        """
+        try:
+            if self.supports_stored_traffic_data:
+                # Use the dedicated stored traffic data method
+                stored_traffic_data = await self.data_client.get_stored_traffic_data_for_training(
+                    tenant_id=tenant_id,
+                    start_date=start_date,
+                    end_date=end_date,
+                    latitude=latitude,
+                    longitude=longitude
+                )
+
+                if stored_traffic_data:
+                    logger.info(f"Retrieved {len(stored_traffic_data)} stored traffic records for training",
+                                tenant_id=tenant_id)
+                    return stored_traffic_data
+                else:
+                    logger.warning("No stored traffic data available for training", tenant_id=tenant_id)
+                    return []
+            else:
+                # Fall back to the regular traffic data method
+                logger.info("Using fallback traffic data method for training")
+                return await self.fetch_traffic_data(
+                    tenant_id=tenant_id,
+                    start_date=start_date,
+                    end_date=end_date,
+                    latitude=latitude,
+                    longitude=longitude
+                )
+
+        except Exception as e:
+            logger.error(f"Error fetching stored traffic data for training: {e}", tenant_id=tenant_id)
+            return []
+
     async def validate_data_quality(
         self,
         tenant_id: str,
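
A short usage sketch for the method above. Note the design choice: every failure path collapses to an empty list, so callers branch on emptiness rather than catching exceptions. How the client is constructed is not shown in this diff:

# Hypothetical caller; DataClient construction details are assumed.
async def load_training_traffic(client: "DataClient") -> list:
    records = await client.fetch_stored_traffic_data_for_training(
        tenant_id="tenant-123",
        start_date="2024-01-01T00:00:00",
        end_date="2024-06-30T23:59:59",
        latitude=40.4168,
        longitude=-3.7038,
    )
    if not records:
        # Empty list covers "no data", "no stored-data support", and errors alike.
        print("no stored traffic available; continuing without the feature")
    return records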
@@ -360,7 +360,7 @@ class TrainingDataOrchestrator:
         aligned_range: AlignedDateRange,
         tenant_id: str
     ) -> List[Dict[str, Any]]:
-        """Collect traffic data with timeout and Madrid constraint validation"""
+        """Collect traffic data with enhanced storage and retrieval for re-training"""
         try:
 
             # Double-check Madrid constraint before making request
@@ -374,6 +374,7 @@ class TrainingDataOrchestrator:
             start_date_str = aligned_range.start.isoformat()
             end_date_str = aligned_range.end.isoformat()
 
+            # Fetch traffic data - this will automatically store it for future re-training
             traffic_data = await self.data_client.fetch_traffic_data(
                 tenant_id=tenant_id,
                 start_date=start_date_str,
@@ -383,7 +384,11 @@ class TrainingDataOrchestrator:
 
             # Validate traffic data
             if self._validate_traffic_data(traffic_data):
-                logger.info(f"Collected {len(traffic_data)} valid traffic records")
+                logger.info(f"Collected and stored {len(traffic_data)} valid traffic records for re-training")
+
+                # Log storage success for audit purposes
+                self._log_traffic_data_storage(lat, lon, aligned_range, len(traffic_data))
+
                 return traffic_data
             else:
                 logger.warning("Invalid traffic data received")
@@ -396,6 +401,69 @@ class TrainingDataOrchestrator:
             logger.warning(f"Traffic data collection failed: {e}")
             return []
 
+    def _log_traffic_data_storage(self,
+                                  lat: float,
+                                  lon: float,
+                                  aligned_range: AlignedDateRange,
+                                  record_count: int):
+        """Log traffic data storage for audit and re-training tracking"""
+        logger.info(
+            "Traffic data stored for re-training",
+            location=f"{lat:.4f},{lon:.4f}",
+            date_range=f"{aligned_range.start.isoformat()} to {aligned_range.end.isoformat()}",
+            records_stored=record_count,
+            storage_timestamp=datetime.now().isoformat(),
+            purpose="model_training_and_retraining"
+        )
+
+    async def retrieve_stored_traffic_for_retraining(
+        self,
+        bakery_location: Tuple[float, float],
+        start_date: datetime,
+        end_date: datetime,
+        tenant_id: str
+    ) -> List[Dict[str, Any]]:
+        """
+        Retrieve previously stored traffic data for model re-training.
+        This method specifically accesses the stored traffic data without making new API calls.
+        """
+        lat, lon = bakery_location
+
+        try:
+            # Use the dedicated stored traffic data method for training
+            stored_traffic_data = await self.data_client.fetch_stored_traffic_data_for_training(
+                tenant_id=tenant_id,
+                start_date=start_date.isoformat(),
+                end_date=end_date.isoformat(),
+                latitude=lat,
+                longitude=lon
+            )
+
+            if stored_traffic_data:
+                logger.info(
+                    f"Retrieved {len(stored_traffic_data)} stored traffic records for re-training",
+                    location=f"{lat:.4f},{lon:.4f}",
+                    date_range=f"{start_date.isoformat()} to {end_date.isoformat()}",
+                    tenant_id=tenant_id
+                )
+                return stored_traffic_data
+            else:
+                logger.warning(
+                    "No stored traffic data found for re-training",
+                    location=f"{lat:.4f},{lon:.4f}",
+                    date_range=f"{start_date.isoformat()} to {end_date.isoformat()}"
+                )
+                return []
+
+        except Exception as e:
+            logger.error(
+                f"Failed to retrieve stored traffic data for re-training: {e}",
+                location=f"{lat:.4f},{lon:.4f}",
+                tenant_id=tenant_id
+            )
+            return []
+
     def _validate_weather_data(self, weather_data: List[Dict[str, Any]]) -> bool:
         """Validate weather data quality"""
         if not weather_data:
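
And a matching sketch at the orchestrator level, for a re-training run that must not hit the Madrid OpenData API again. The orchestrator construction is assumed:

# Hypothetical re-training driver; orchestrator setup is outside this diff.
from datetime import datetime

async def collect_retraining_traffic(orchestrator: "TrainingDataOrchestrator"):
    return await orchestrator.retrieve_stored_traffic_for_retraining(
        bakery_location=(40.4168, -3.7038),
        start_date=datetime(2024, 1, 1),
        end_date=datetime(2024, 6, 30),
        tenant_id="tenant-123",
    )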