REFACTOR data service

Urtzi Alfaro
2025-08-12 18:17:30 +02:00
parent 7c237c0acc
commit fbe7470ad9
149 changed files with 8528 additions and 7393 deletions

@@ -0,0 +1,191 @@
# ================================================================
# services/data/app/repositories/traffic_repository.py
# ================================================================
"""
Traffic Repository - enhanced for multiple cities with comprehensive data access patterns.
Follows the existing repository architecture while adding city-specific functionality.
"""
from typing import Optional, List, Dict, Any, Type, Tuple
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, or_, func, desc, asc, text, update, delete
from sqlalchemy.orm import selectinload
from datetime import datetime, timezone, timedelta
import structlog

from app.models.traffic import TrafficData
from app.schemas.traffic import TrafficDataCreate, TrafficDataResponse
from shared.database.exceptions import DatabaseError, ValidationError

logger = structlog.get_logger()

class TrafficRepository:
    """
    Enhanced repository for traffic data operations across multiple cities.
    Provides city-aware queries and advanced traffic analytics.
    """

    def __init__(self, session: AsyncSession):
        self.session = session
        self.model = TrafficData

    # ================================================================
    # CORE TRAFFIC DATA OPERATIONS
    # ================================================================
    async def get_by_location_and_date_range(
        self,
        latitude: float,
        longitude: float,
        start_date: datetime,
        end_date: datetime,
        tenant_id: Optional[str] = None
    ) -> List[TrafficData]:
        """Get traffic data by location and date range."""
        try:
            location_id = f"{latitude:.4f},{longitude:.4f}"

            # Build base query
            query = select(self.model).where(self.model.location_id == location_id)

            # Add tenant filter if specified
            if tenant_id:
                query = query.where(self.model.tenant_id == tenant_id)

            # Add date range filters
            if start_date:
                query = query.where(self.model.date >= start_date)
            if end_date:
                query = query.where(self.model.date <= end_date)

            # Order by date
            query = query.order_by(self.model.date)

            result = await self.session.execute(query)
            return list(result.scalars().all())
        except Exception as e:
            logger.error("Failed to get traffic data by location and date range",
                         latitude=latitude, longitude=longitude,
                         error=str(e))
            raise DatabaseError(f"Failed to get traffic data: {str(e)}")
    async def store_traffic_data_batch(
        self,
        traffic_data_list: List[Dict[str, Any]],
        location_id: str,
        tenant_id: Optional[str] = None
    ) -> int:
        """Store a batch of traffic data records with enhanced validation and duplicate handling."""
        stored_count = 0
        try:
            if not traffic_data_list:
                return 0

            # Check for existing records to avoid duplicates
            dates = [data.get('date') for data in traffic_data_list if data.get('date')]
            existing_dates = set()
            if dates:
                existing_stmt = select(TrafficData.date).where(
                    and_(
                        TrafficData.location_id == location_id,
                        TrafficData.date.in_(dates)
                    )
                )
                result = await self.session.execute(existing_stmt)
                existing_dates = {row[0] for row in result.fetchall()}
                logger.debug(f"Found {len(existing_dates)} existing records for location {location_id}")

            batch_records = []
            for data in traffic_data_list:
                record_date = data.get('date')
                if not record_date or record_date in existing_dates:
                    continue  # Skip duplicates

                # Validate data before preparing it for insertion
                if self._validate_traffic_data(data):
                    batch_records.append({
                        'location_id': location_id,
                        'city': data.get('city', 'madrid'),  # Default to madrid for historical data
                        'tenant_id': tenant_id,  # Include tenant_id in batch insert
                        'date': record_date,
                        'traffic_volume': data.get('traffic_volume'),
                        'pedestrian_count': data.get('pedestrian_count'),
                        'congestion_level': data.get('congestion_level'),
                        'average_speed': data.get('average_speed'),
                        'source': data.get('source', 'unknown'),
                        'raw_data': str(data)
                    })

            if batch_records:
                # Use a bulk insert for performance
                await self.session.execute(
                    TrafficData.__table__.insert(),
                    batch_records
                )
                await self.session.commit()
                stored_count = len(batch_records)
                logger.info(f"Successfully stored {stored_count} traffic records for location {location_id}")
        except Exception as e:
            logger.error("Failed to store traffic data batch",
                         error=str(e), location_id=location_id)
            await self.session.rollback()
            raise DatabaseError(f"Batch store failed: {str(e)}")
        return stored_count
    def _validate_traffic_data(self, data: Dict[str, Any]) -> bool:
        """Validate traffic data before storage."""
        required_fields = ['date']

        # Check required fields
        for field in required_fields:
            if not data.get(field):
                return False

        # Validate data types and ranges
        traffic_volume = data.get('traffic_volume')
        if traffic_volume is not None and (traffic_volume < 0 or traffic_volume > 10000):
            return False

        pedestrian_count = data.get('pedestrian_count')
        if pedestrian_count is not None and (pedestrian_count < 0 or pedestrian_count > 10000):
            return False

        average_speed = data.get('average_speed')
        if average_speed is not None and (average_speed < 0 or average_speed > 200):
            return False

        congestion_level = data.get('congestion_level')
        if congestion_level and congestion_level not in ['low', 'medium', 'high', 'blocked']:
            return False

        return True
    async def get_historical_traffic_for_training(self,
                                                  latitude: float,
                                                  longitude: float,
                                                  start_date: datetime,
                                                  end_date: datetime) -> List[TrafficData]:
        """Retrieve stored traffic data for training ML models."""
        # Compute the location key before the try block so it is always
        # bound when the error handler below logs it.
        location_id = f"{latitude:.4f},{longitude:.4f}"
        try:
            stmt = select(TrafficData).where(
                and_(
                    TrafficData.location_id == location_id,
                    TrafficData.date >= start_date,
                    TrafficData.date <= end_date
                )
            ).order_by(TrafficData.date)
            result = await self.session.execute(stmt)
            return list(result.scalars().all())
        except Exception as e:
            logger.error("Failed to retrieve traffic data for training",
                         error=str(e), location_id=location_id)
            raise DatabaseError(f"Training data retrieval failed: {str(e)}")

@@ -0,0 +1,138 @@
# services/external/app/repositories/weather_repository.py
from typing import List, Dict, Any, Optional
from datetime import datetime
from sqlalchemy import select, and_
from sqlalchemy.ext.asyncio import AsyncSession
import structlog
import json

from app.models.weather import WeatherData

logger = structlog.get_logger()


class WeatherRepository:
    """
    Repository for weather data operations, adapted for WeatherService.
    """

    def __init__(self, session: AsyncSession):
        self.session = session
    async def get_historical_weather(self,
                                     location_id: str,
                                     start_date: datetime,
                                     end_date: datetime) -> List[WeatherData]:
        """
        Retrieves historical weather data for a specific location and date range.
        This method directly supports the data retrieval logic in WeatherService.
        """
        try:
            stmt = select(WeatherData).where(
                and_(
                    WeatherData.location_id == location_id,
                    WeatherData.date >= start_date,
                    WeatherData.date <= end_date
                )
            ).order_by(WeatherData.date)
            result = await self.session.execute(stmt)
            records = result.scalars().all()
            logger.debug(f"Retrieved {len(records)} historical records for location {location_id}")
            return list(records)
        except Exception as e:
            logger.error(
                "Failed to get historical weather from repository",
                error=str(e),
                location_id=location_id
            )
            raise
    def _serialize_json_fields(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Serialize JSON fields (raw_data, processed_data) to ensure proper JSON storage.
        Note: only top-level datetime-like values are converted; nested
        structures are passed through unchanged.
        """
        serialized = data.copy()
        for field in ('raw_data', 'processed_data'):
            value = serialized.get(field)
            if value is None or isinstance(value, str):
                continue
            try:
                if isinstance(value, dict):
                    # Convert datetime-like values to ISO strings so the dict
                    # can be stored as JSON
                    serialized[field] = {
                        k: v.isoformat() if hasattr(v, 'isoformat') else v
                        for k, v in value.items()
                    }
            except Exception as e:
                logger.warning(f"Could not serialize {field}, storing as string: {e}")
                serialized[field] = str(value)
        return serialized
    async def bulk_create_weather_data(self, weather_records: List[Dict[str, Any]]) -> None:
        """
        Bulk inserts new weather records into the database.
        Used by WeatherService after fetching new historical data from an external API.
        """
        try:
            if not weather_records:
                return

            # Serialize JSON fields before creating model instances
            serialized_records = [self._serialize_json_fields(data) for data in weather_records]
            records = [WeatherData(**data) for data in serialized_records]

            self.session.add_all(records)
            await self.session.commit()
            logger.info(f"Successfully bulk inserted {len(records)} weather records")
        except Exception as e:
            await self.session.rollback()
            logger.error(
                "Failed to bulk create weather records",
                error=str(e),
                count=len(weather_records)
            )
            raise
    async def create_weather_data(self, data: Dict[str, Any]) -> WeatherData:
        """
        Creates a single new weather data record.
        """
        try:
            # Serialize JSON fields before creating the model instance
            serialized_data = self._serialize_json_fields(data)
            new_record = WeatherData(**serialized_data)
            self.session.add(new_record)
            await self.session.commit()
            await self.session.refresh(new_record)
            logger.info(f"Created new weather record with ID {new_record.id}")
            return new_record
        except Exception as e:
            await self.session.rollback()
            logger.error("Failed to create single weather record", error=str(e))
            raise
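
As with the traffic repository, a minimal usage sketch; the session factory and any WeatherData fields beyond those referenced in this file are illustrative assumptions.

    # Hypothetical caller (not in this commit); assumes the WeatherData model
    # accepts location_id, date, and raw_data keyword arguments.
    from datetime import datetime, timezone

    async def backfill_weather(async_session_factory) -> None:
        async with async_session_factory() as session:
            repo = WeatherRepository(session)
            # Top-level datetimes in raw_data are ISO-formatted by
            # _serialize_json_fields before insertion
            await repo.bulk_create_weather_data([{
                'location_id': "40.4168,-3.7038",
                'date': datetime(2025, 8, 1, tzinfo=timezone.utc),
                'raw_data': {'temp_c': 31.5, 'fetched_at': datetime.now(timezone.utc)},
            }])
            history = await repo.get_historical_weather(
                location_id="40.4168,-3.7038",
                start_date=datetime(2025, 8, 1, tzinfo=timezone.utc),
                end_date=datetime(2025, 8, 31, tzinfo=timezone.utc),
            )
            logger.info("weather backfill complete", rows=len(history))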