REFACTOR external service and improve websocket training

Urtzi Alfaro
2025-10-09 14:11:02 +02:00
parent 7c72f83c51
commit 3c689b4f98
111 changed files with 13289 additions and 2374 deletions


@@ -842,10 +842,19 @@ class AEMETClient(BaseAPIClient):
"""Fetch forecast data from AEMET API"""
endpoint = f"/prediccion/especifica/municipio/diaria/{municipality_code}"
initial_response = await self._get(endpoint)
# Check for AEMET error responses
if initial_response and isinstance(initial_response, dict):
aemet_estado = initial_response.get("estado")
if aemet_estado == 404 or aemet_estado == "404":
logger.warning("AEMET API returned 404 error",
mensaje=initial_response.get("descripcion"),
municipality=municipality_code)
return None
if not self._is_valid_initial_response(initial_response):
return None
datos_url = initial_response.get("datos")
return await self._fetch_from_url(datos_url)
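The daily-forecast change above follows AEMET's two-step contract: the first call returns a small JSON envelope whose datos field points at the real payload, and failures are reported in-band through estado/descripcion rather than HTTP status codes. A minimal standalone sketch of that flow, assuming the public opendata.aemet.es base URL and a hypothetical API_KEY passed as the api_key query parameter; it is not the client class in this diff, only the pattern it implements:

import httpx

AEMET_BASE = "https://opendata.aemet.es/opendata/api"
API_KEY = "..."  # hypothetical key; assumed to be accepted as a query parameter

async def fetch_daily_forecast(municipality_code: str):
    """Sketch of the two-step AEMET fetch: metadata envelope first, then the datos URL."""
    async with httpx.AsyncClient(timeout=30) as client:
        meta = (await client.get(
            f"{AEMET_BASE}/prediccion/especifica/municipio/diaria/{municipality_code}",
            params={"api_key": API_KEY},
        )).json()
        # AEMET signals errors in-band with estado/descripcion instead of HTTP codes
        if str(meta.get("estado")) == "404":
            return None
        datos_url = meta.get("datos")
        if not datos_url:
            return None
        return (await client.get(datos_url)).json()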
@@ -854,42 +863,65 @@ class AEMETClient(BaseAPIClient):
# Note: AEMET hourly forecast API endpoint
endpoint = f"/prediccion/especifica/municipio/horaria/{municipality_code}"
logger.info("Requesting AEMET hourly forecast", endpoint=endpoint, municipality=municipality_code)
initial_response = await self._get(endpoint)
# Check for AEMET error responses
if initial_response and isinstance(initial_response, dict):
aemet_estado = initial_response.get("estado")
if aemet_estado == 404 or aemet_estado == "404":
logger.warning("AEMET API returned 404 error for hourly forecast",
mensaje=initial_response.get("descripcion"),
municipality=municipality_code)
return None
if not self._is_valid_initial_response(initial_response):
logger.warning("Invalid initial response from AEMET hourly API",
logger.warning("Invalid initial response from AEMET hourly API",
response=initial_response, municipality=municipality_code)
return None
datos_url = initial_response.get("datos")
logger.info("Fetching hourly data from AEMET datos URL", url=datos_url)
return await self._fetch_from_url(datos_url)
async def _fetch_historical_data_in_chunks(self,
station_id: str,
start_date: datetime,
end_date: datetime) -> List[Dict[str, Any]]:
"""Fetch historical data in chunks due to AEMET API limitations"""
import asyncio
historical_data = []
current_date = start_date
chunk_count = 0
while current_date <= end_date:
chunk_end_date = min(
current_date + timedelta(days=AEMETConstants.MAX_DAYS_PER_REQUEST),
end_date
)
# Add delay to respect rate limits (AEMET allows ~60 requests/minute)
# Wait 2 seconds between requests to stay well under the limit
if chunk_count > 0:
await asyncio.sleep(2)
chunk_data = await self._fetch_historical_chunk(
station_id, current_date, chunk_end_date
)
if chunk_data:
historical_data.extend(chunk_data)
current_date = chunk_end_date + timedelta(days=1)
chunk_count += 1
# Log progress every 5 chunks
if chunk_count % 5 == 0:
logger.info("Historical data fetch progress",
chunks_fetched=chunk_count,
records_so_far=len(historical_data))
return historical_data
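The chunked fetch above bounds each request to AEMETConstants.MAX_DAYS_PER_REQUEST days and sleeps between calls to stay under the roughly 60 requests/minute limit. A condensed sketch of the same windowing idea as a free function; the constant values and the fetch_chunk callable are stand-ins, not the actual client internals:

import asyncio
from datetime import datetime, timedelta

MAX_DAYS_PER_REQUEST = 30   # stand-in for AEMETConstants.MAX_DAYS_PER_REQUEST
REQUEST_DELAY_S = 2         # spacing to stay well under ~60 requests/minute

async def fetch_in_chunks(fetch_chunk, start: datetime, end: datetime):
    """Split [start, end] into API-sized windows, pausing between calls."""
    records, current, chunks = [], start, 0
    while current <= end:
        chunk_end = min(current + timedelta(days=MAX_DAYS_PER_REQUEST), end)
        if chunks:
            await asyncio.sleep(REQUEST_DELAY_S)  # rate-limit spacing between windows
        data = await fetch_chunk(current, chunk_end)
        if data:
            records.extend(data)
        current = chunk_end + timedelta(days=1)
        chunks += 1
    return records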
async def _fetch_historical_chunk(self,
@@ -930,13 +962,37 @@ class AEMETClient(BaseAPIClient):
"""Fetch data from AEMET datos URL"""
try:
data = await self._fetch_url_directly(url)
if data and isinstance(data, list):
return data
else:
logger.warning("Expected list from datos URL", data_type=type(data))
if data is None:
logger.warning("No data received from datos URL", url=url)
return None
# Check if we got an AEMET error response (dict with estado/descripcion)
if isinstance(data, dict):
aemet_estado = data.get("estado")
aemet_mensaje = data.get("descripcion")
if aemet_estado or aemet_mensaje:
logger.warning("AEMET datos URL returned error response",
estado=aemet_estado,
mensaje=aemet_mensaje,
url=url)
return None
else:
# It's a dict but not an error response - unexpected format
logger.warning("Expected list from datos URL but got dict",
data_type=type(data),
keys=list(data.keys())[:5],
url=url)
return None
if isinstance(data, list):
return data
logger.warning("Unexpected data type from datos URL",
data_type=type(data), url=url)
return None
except Exception as e:
logger.error("Failed to fetch from datos URL", url=url, error=str(e))
return None
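The new _fetch_from_url logic boils down to one rule: a dict coming back from a datos URL is never valid data, because AEMET uses dicts for its in-band error envelopes, while real payloads are lists of records. A hypothetical helper expressing that rule on its own:

from typing import Any, List, Optional

def validate_datos_payload(data: Any) -> Optional[List[dict]]:
    """Return the record list, or None for AEMET error dicts and unexpected shapes."""
    if isinstance(data, dict):
        # AEMET reports failures in-band as {"estado": ..., "descripcion": ...};
        # any dict here means we did not get the expected list of records
        return None
    return data if isinstance(data, list) else None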


@@ -318,49 +318,86 @@ class MadridTrafficClient(BaseTrafficClient, BaseAPIClient):
async def _process_historical_zip_enhanced(self, zip_content: bytes, zip_url: str,
latitude: float, longitude: float,
nearest_points: List[Tuple[str, Dict[str, Any], float]]) -> List[Dict[str, Any]]:
"""Process historical ZIP file with enhanced parsing"""
"""Process historical ZIP file with memory-efficient streaming"""
try:
import zipfile
import io
import csv
import gc
historical_records = []
nearest_ids = {p[0] for p in nearest_points}
with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
csv_files = [f for f in zip_file.namelist() if f.lower().endswith('.csv')]
for csv_filename in csv_files:
try:
# Read CSV content
# Stream CSV file line-by-line to avoid loading entire file into memory
with zip_file.open(csv_filename) as csv_file:
text_content = csv_file.read().decode('utf-8', errors='ignore')
# Process CSV in chunks using processor
csv_records = await self.processor.process_csv_content_chunked(
text_content, csv_filename, nearest_ids, nearest_points
)
historical_records.extend(csv_records)
# Force garbage collection
# Decode the byte stream incrementally for line-by-line reading
import codecs
text_wrapper = codecs.iterdecode(csv_file, 'utf-8', errors='ignore')
csv_reader = csv.DictReader(text_wrapper, delimiter=';')
# Process in small batches
batch_size = 5000
batch_records = []
row_count = 0
for row in csv_reader:
row_count += 1
measurement_point_id = row.get('id', '').strip()
# Skip rows we don't need
if measurement_point_id not in nearest_ids:
continue
try:
record_data = await self.processor.parse_historical_csv_row(row, nearest_points)
if record_data:
batch_records.append(record_data)
# Store and clear batch when full
if len(batch_records) >= batch_size:
historical_records.extend(batch_records)
batch_records = []
gc.collect()
except Exception:
continue
# Store remaining records
if batch_records:
historical_records.extend(batch_records)
batch_records = []
self.logger.info("CSV file processed",
filename=csv_filename,
rows_scanned=row_count,
records_extracted=len(historical_records))
# Aggressive garbage collection after each CSV
gc.collect()
except Exception as csv_error:
self.logger.warning("Error processing CSV file",
filename=csv_filename,
self.logger.warning("Error processing CSV file",
filename=csv_filename,
error=str(csv_error))
continue
self.logger.info("Historical ZIP processing completed",
self.logger.info("Historical ZIP processing completed",
zip_url=zip_url,
total_records=len(historical_records))
# Final cleanup
del zip_content
gc.collect()
return historical_records
except Exception as e:
self.logger.error("Error processing historical ZIP file",
self.logger.error("Error processing historical ZIP file",
zip_url=zip_url, error=str(e))
return []
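The rewritten ZIP handler trades the old read-everything-then-parse approach for streaming: each CSV inside the archive is decoded lazily and filtered row by row, so only rows for the nearest measurement points stay in memory. An equivalent standalone sketch using io.TextIOWrapper instead of codecs.iterdecode; the 'id' column and ';' delimiter mirror the Madrid open-data CSVs as used above:

import csv
import io
import zipfile

def stream_csv_rows(zip_content: bytes, wanted_ids: set):
    """Yield rows for the selected measurement points without loading whole CSVs."""
    with zipfile.ZipFile(io.BytesIO(zip_content)) as zf:
        for name in zf.namelist():
            if not name.lower().endswith(".csv"):
                continue
            with zf.open(name) as raw:
                # TextIOWrapper decodes lazily, one buffered block at a time
                text = io.TextIOWrapper(raw, encoding="utf-8", errors="ignore")
                for row in csv.DictReader(text, delimiter=";"):
                    if row.get("id", "").strip() in wanted_ids:
                        yield row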


@@ -50,8 +50,20 @@ class BaseAPIClient:
return response_data
except httpx.HTTPStatusError as e:
logger.error("HTTP error", status_code=e.response.status_code, url=url,
logger.error("HTTP error", status_code=e.response.status_code, url=url,
response_text=e.response.text[:200], attempt=attempt + 1)
# Handle rate limiting (429) with longer backoff
if e.response.status_code == 429:
import asyncio
# Exponential backoff: 5s, 15s, 45s for rate limits
wait_time = 5 * (3 ** attempt)
logger.warning(f"Rate limit hit, waiting {wait_time}s before retry",
attempt=attempt + 1, max_attempts=self.retries)
await asyncio.sleep(wait_time)
if attempt < self.retries - 1:
continue
if attempt == self.retries - 1: # Last attempt
return None
except httpx.RequestError as e:
@@ -72,51 +84,87 @@ class BaseAPIClient:
return None
async def _fetch_url_directly(self, url: str, headers: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
"""Fetch data directly from a full URL (for AEMET datos URLs)"""
try:
request_headers = headers or {}
logger.debug("Making direct URL request", url=url)
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.get(url, headers=request_headers)
response.raise_for_status()
# Handle encoding issues common with Spanish data sources
try:
response_data = response.json()
except UnicodeDecodeError:
logger.warning("UTF-8 decode failed, trying alternative encodings", url=url)
# Try common Spanish encodings
for encoding in ['latin-1', 'windows-1252', 'iso-8859-1']:
try:
text_content = response.content.decode(encoding)
import json
response_data = json.loads(text_content)
logger.info("Successfully decoded with encoding", encoding=encoding)
break
except (UnicodeDecodeError, json.JSONDecodeError):
continue
else:
logger.error("Failed to decode response with any encoding", url=url)
return None
logger.debug("Direct URL response received",
status_code=response.status_code,
data_type=type(response_data),
data_length=len(response_data) if isinstance(response_data, (list, dict)) else "unknown")
return response_data
except httpx.HTTPStatusError as e:
logger.error("HTTP error in direct fetch", status_code=e.response.status_code, url=url)
return None
except httpx.RequestError as e:
logger.error("Request error in direct fetch", error=str(e), url=url)
return None
except Exception as e:
logger.error("Unexpected error in direct fetch", error=str(e), url=url)
return None
"""Fetch data directly from a full URL (for AEMET datos URLs) with retry logic"""
request_headers = headers or {}
logger.debug("Making direct URL request", url=url)
# Retry logic for unstable AEMET datos URLs
for attempt in range(self.retries):
try:
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.get(url, headers=request_headers)
response.raise_for_status()
# Handle encoding issues common with Spanish data sources
try:
response_data = response.json()
except UnicodeDecodeError:
logger.warning("UTF-8 decode failed, trying alternative encodings", url=url)
# Try common Spanish encodings
for encoding in ['latin-1', 'windows-1252', 'iso-8859-1']:
try:
text_content = response.content.decode(encoding)
import json
response_data = json.loads(text_content)
logger.info("Successfully decoded with encoding", encoding=encoding)
break
except (UnicodeDecodeError, json.JSONDecodeError):
continue
else:
logger.error("Failed to decode response with any encoding", url=url)
if attempt < self.retries - 1:
continue
return None
logger.debug("Direct URL response received",
status_code=response.status_code,
data_type=type(response_data),
data_length=len(response_data) if isinstance(response_data, (list, dict)) else "unknown")
return response_data
except httpx.HTTPStatusError as e:
logger.error("HTTP error in direct fetch",
status_code=e.response.status_code,
url=url,
attempt=attempt + 1)
# On last attempt, return None
if attempt == self.retries - 1:
return None
# Wait before retry
import asyncio
wait_time = 2 ** attempt # 1s, 2s, 4s
logger.info(f"Retrying datos URL in {wait_time}s",
attempt=attempt + 1, max_attempts=self.retries)
await asyncio.sleep(wait_time)
except httpx.RequestError as e:
logger.error("Request error in direct fetch",
error=str(e), url=url, attempt=attempt + 1)
# On last attempt, return None
if attempt == self.retries - 1:
return None
# Wait before retry
import asyncio
wait_time = 2 ** attempt # 1s, 2s, 4s
logger.info(f"Retrying datos URL in {wait_time}s",
attempt=attempt + 1, max_attempts=self.retries)
await asyncio.sleep(wait_time)
except Exception as e:
logger.error("Unexpected error in direct fetch",
error=str(e), url=url, attempt=attempt + 1)
# On last attempt, return None
if attempt == self.retries - 1:
return None
return None
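Both retry paths added in this file share one shape: try, log, back off, try again, and give up after self.retries attempts, with a longer schedule when the server is rate limiting (5s/15s/45s) than for ordinary transient failures (1s/2s/4s). A compact sketch of that pattern as a hypothetical standalone helper rather than the class methods above; the fetch callable and rate_limited predicate are stand-ins:

import asyncio
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")

async def with_backoff(fetch: Callable[[], Awaitable[T]],
                       retries: int = 3,
                       rate_limited: Callable[[Exception], bool] = lambda exc: False) -> Optional[T]:
    """Retry a coroutine with exponential backoff; wait longer on rate limits."""
    for attempt in range(retries):
        try:
            return await fetch()
        except Exception as exc:
            if attempt == retries - 1:
                return None
            # 5s/15s/45s when the caller flags a 429, otherwise 1s/2s/4s
            wait = 5 * (3 ** attempt) if rate_limited(exc) else 2 ** attempt
            await asyncio.sleep(wait)
    return None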
async def _post(self, endpoint: str, data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
"""Make POST request"""