REFACTOR external service and improve websocket training

Urtzi Alfaro
2025-10-09 14:11:02 +02:00
parent 7c72f83c51
commit 3c689b4f98
111 changed files with 13289 additions and 2374 deletions


@@ -842,10 +842,19 @@ class AEMETClient(BaseAPIClient):
"""Fetch forecast data from AEMET API"""
endpoint = f"/prediccion/especifica/municipio/diaria/{municipality_code}"
initial_response = await self._get(endpoint)
# Check for AEMET error responses
if initial_response and isinstance(initial_response, dict):
aemet_estado = initial_response.get("estado")
if aemet_estado == 404 or aemet_estado == "404":
logger.warning("AEMET API returned 404 error",
mensaje=initial_response.get("descripcion"),
municipality=municipality_code)
return None
if not self._is_valid_initial_response(initial_response):
return None
datos_url = initial_response.get("datos")
return await self._fetch_from_url(datos_url)
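The daily-forecast change above follows AEMET's two-step contract: the first call returns a small JSON envelope whose datos field points at the real payload, and failures are reported in-band through estado/descripcion rather than HTTP status codes. A minimal standalone sketch of that flow, assuming the public opendata.aemet.es base URL and a hypothetical API_KEY passed as the api_key query parameter; it is not the client class in this diff, only the pattern it implements:

import httpx

AEMET_BASE = "https://opendata.aemet.es/opendata/api"
API_KEY = "..."  # hypothetical key; assumed to be accepted as a query parameter

async def fetch_daily_forecast(municipality_code: str):
    """Sketch of the two-step AEMET fetch: metadata envelope first, then the datos URL."""
    async with httpx.AsyncClient(timeout=30) as client:
        meta = (await client.get(
            f"{AEMET_BASE}/prediccion/especifica/municipio/diaria/{municipality_code}",
            params={"api_key": API_KEY},
        )).json()
        # AEMET signals errors in-band with estado/descripcion instead of HTTP codes
        if str(meta.get("estado")) == "404":
            return None
        datos_url = meta.get("datos")
        if not datos_url:
            return None
        return (await client.get(datos_url)).json()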
@@ -854,42 +863,65 @@ class AEMETClient(BaseAPIClient):
# Note: AEMET hourly forecast API endpoint
endpoint = f"/prediccion/especifica/municipio/horaria/{municipality_code}"
logger.info("Requesting AEMET hourly forecast", endpoint=endpoint, municipality=municipality_code)
initial_response = await self._get(endpoint)
# Check for AEMET error responses
if initial_response and isinstance(initial_response, dict):
aemet_estado = initial_response.get("estado")
if aemet_estado == 404 or aemet_estado == "404":
logger.warning("AEMET API returned 404 error for hourly forecast",
mensaje=initial_response.get("descripcion"),
municipality=municipality_code)
return None
if not self._is_valid_initial_response(initial_response):
logger.warning("Invalid initial response from AEMET hourly API",
logger.warning("Invalid initial response from AEMET hourly API",
response=initial_response, municipality=municipality_code)
return None
datos_url = initial_response.get("datos")
logger.info("Fetching hourly data from AEMET datos URL", url=datos_url)
return await self._fetch_from_url(datos_url)
async def _fetch_historical_data_in_chunks(self,
station_id: str,
start_date: datetime,
end_date: datetime) -> List[Dict[str, Any]]:
"""Fetch historical data in chunks due to AEMET API limitations"""
import asyncio
historical_data = []
current_date = start_date
chunk_count = 0
while current_date <= end_date:
chunk_end_date = min(
current_date + timedelta(days=AEMETConstants.MAX_DAYS_PER_REQUEST),
end_date
)
# Add delay to respect rate limits (AEMET allows ~60 requests/minute)
# Wait 2 seconds between requests to stay well under the limit
if chunk_count > 0:
await asyncio.sleep(2)
chunk_data = await self._fetch_historical_chunk(
station_id, current_date, chunk_end_date
)
if chunk_data:
historical_data.extend(chunk_data)
current_date = chunk_end_date + timedelta(days=1)
chunk_count += 1
# Log progress every 5 chunks
if chunk_count % 5 == 0:
logger.info("Historical data fetch progress",
chunks_fetched=chunk_count,
records_so_far=len(historical_data))
return historical_data
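The chunked fetch above bounds each request to AEMETConstants.MAX_DAYS_PER_REQUEST days and sleeps between calls to stay under the roughly 60 requests/minute limit. A condensed sketch of the same windowing idea as a free function; the constant values and the fetch_chunk callable are stand-ins, not the actual client internals:

import asyncio
from datetime import datetime, timedelta

MAX_DAYS_PER_REQUEST = 30   # stand-in for AEMETConstants.MAX_DAYS_PER_REQUEST
REQUEST_DELAY_S = 2         # spacing to stay well under ~60 requests/minute

async def fetch_in_chunks(fetch_chunk, start: datetime, end: datetime):
    """Split [start, end] into API-sized windows, pausing between calls."""
    records, current, chunks = [], start, 0
    while current <= end:
        chunk_end = min(current + timedelta(days=MAX_DAYS_PER_REQUEST), end)
        if chunks:
            await asyncio.sleep(REQUEST_DELAY_S)  # rate-limit spacing between windows
        data = await fetch_chunk(current, chunk_end)
        if data:
            records.extend(data)
        current = chunk_end + timedelta(days=1)
        chunks += 1
    return records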
async def _fetch_historical_chunk(self,
@@ -930,13 +962,37 @@ class AEMETClient(BaseAPIClient):
"""Fetch data from AEMET datos URL"""
try:
data = await self._fetch_url_directly(url)
if data and isinstance(data, list):
return data
else:
logger.warning("Expected list from datos URL", data_type=type(data))
if data is None:
logger.warning("No data received from datos URL", url=url)
return None
# Check if we got an AEMET error response (dict with estado/descripcion)
if isinstance(data, dict):
aemet_estado = data.get("estado")
aemet_mensaje = data.get("descripcion")
if aemet_estado or aemet_mensaje:
logger.warning("AEMET datos URL returned error response",
estado=aemet_estado,
mensaje=aemet_mensaje,
url=url)
return None
else:
# It's a dict but not an error response - unexpected format
logger.warning("Expected list from datos URL but got dict",
data_type=type(data),
keys=list(data.keys())[:5],
url=url)
return None
if isinstance(data, list):
return data
logger.warning("Unexpected data type from datos URL",
data_type=type(data), url=url)
return None
except Exception as e:
logger.error("Failed to fetch from datos URL", url=url, error=str(e))
return None
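The new _fetch_from_url logic boils down to one rule: a dict coming back from a datos URL is never valid data, because AEMET uses dicts for its in-band error envelopes, while real payloads are lists of records. A hypothetical helper expressing that rule on its own:

from typing import Any, List, Optional

def validate_datos_payload(data: Any) -> Optional[List[dict]]:
    """Return the record list, or None for AEMET error dicts and unexpected shapes."""
    if isinstance(data, dict):
        # AEMET reports failures in-band as {"estado": ..., "descripcion": ...};
        # any dict here means we did not get the expected list of records
        return None
    return data if isinstance(data, list) else None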


@@ -318,49 +318,86 @@ class MadridTrafficClient(BaseTrafficClient, BaseAPIClient):
async def _process_historical_zip_enhanced(self, zip_content: bytes, zip_url: str,
latitude: float, longitude: float,
nearest_points: List[Tuple[str, Dict[str, Any], float]]) -> List[Dict[str, Any]]:
"""Process historical ZIP file with enhanced parsing"""
"""Process historical ZIP file with memory-efficient streaming"""
try:
import zipfile
import io
import csv
import gc
historical_records = []
nearest_ids = {p[0] for p in nearest_points}
with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
csv_files = [f for f in zip_file.namelist() if f.lower().endswith('.csv')]
for csv_filename in csv_files:
try:
# Read CSV content
# Stream CSV file line-by-line to avoid loading entire file into memory
with zip_file.open(csv_filename) as csv_file:
text_content = csv_file.read().decode('utf-8', errors='ignore')
# Process CSV in chunks using processor
csv_records = await self.processor.process_csv_content_chunked(
text_content, csv_filename, nearest_ids, nearest_points
)
historical_records.extend(csv_records)
# Force garbage collection
# Decode the byte stream incrementally for line-by-line reading
import codecs
text_wrapper = codecs.iterdecode(csv_file, 'utf-8', errors='ignore')
csv_reader = csv.DictReader(text_wrapper, delimiter=';')
# Process in small batches
batch_size = 5000
batch_records = []
row_count = 0
for row in csv_reader:
row_count += 1
measurement_point_id = row.get('id', '').strip()
# Skip rows we don't need
if measurement_point_id not in nearest_ids:
continue
try:
record_data = await self.processor.parse_historical_csv_row(row, nearest_points)
if record_data:
batch_records.append(record_data)
# Store and clear batch when full
if len(batch_records) >= batch_size:
historical_records.extend(batch_records)
batch_records = []
gc.collect()
except Exception:
continue
# Store remaining records
if batch_records:
historical_records.extend(batch_records)
batch_records = []
self.logger.info("CSV file processed",
filename=csv_filename,
rows_scanned=row_count,
records_extracted=len(historical_records))
# Aggressive garbage collection after each CSV
gc.collect()
except Exception as csv_error:
self.logger.warning("Error processing CSV file",
filename=csv_filename,
self.logger.warning("Error processing CSV file",
filename=csv_filename,
error=str(csv_error))
continue
self.logger.info("Historical ZIP processing completed",
self.logger.info("Historical ZIP processing completed",
zip_url=zip_url,
total_records=len(historical_records))
# Final cleanup
del zip_content
gc.collect()
return historical_records
except Exception as e:
self.logger.error("Error processing historical ZIP file",
self.logger.error("Error processing historical ZIP file",
zip_url=zip_url, error=str(e))
return []
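The rewritten ZIP handler trades the old read-everything-then-parse approach for streaming: each CSV inside the archive is decoded lazily and filtered row by row, so only rows for the nearest measurement points stay in memory. An equivalent standalone sketch using io.TextIOWrapper instead of codecs.iterdecode; the 'id' column and ';' delimiter mirror the Madrid open-data CSVs as used above:

import csv
import io
import zipfile

def stream_csv_rows(zip_content: bytes, wanted_ids: set):
    """Yield rows for the selected measurement points without loading whole CSVs."""
    with zipfile.ZipFile(io.BytesIO(zip_content)) as zf:
        for name in zf.namelist():
            if not name.lower().endswith(".csv"):
                continue
            with zf.open(name) as raw:
                # TextIOWrapper decodes lazily, one buffered block at a time
                text = io.TextIOWrapper(raw, encoding="utf-8", errors="ignore")
                for row in csv.DictReader(text, delimiter=";"):
                    if row.get("id", "").strip() in wanted_ids:
                        yield row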


@@ -50,8 +50,20 @@ class BaseAPIClient:
return response_data
except httpx.HTTPStatusError as e:
logger.error("HTTP error", status_code=e.response.status_code, url=url,
logger.error("HTTP error", status_code=e.response.status_code, url=url,
response_text=e.response.text[:200], attempt=attempt + 1)
# Handle rate limiting (429) with longer backoff
if e.response.status_code == 429:
import asyncio
# Exponential backoff: 5s, 15s, 45s for rate limits
wait_time = 5 * (3 ** attempt)
logger.warning(f"Rate limit hit, waiting {wait_time}s before retry",
attempt=attempt + 1, max_attempts=self.retries)
await asyncio.sleep(wait_time)
if attempt < self.retries - 1:
continue
if attempt == self.retries - 1: # Last attempt
return None
except httpx.RequestError as e:
@@ -72,51 +84,87 @@ class BaseAPIClient:
return None
async def _fetch_url_directly(self, url: str, headers: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
"""Fetch data directly from a full URL (for AEMET datos URLs)"""
try:
request_headers = headers or {}
logger.debug("Making direct URL request", url=url)
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.get(url, headers=request_headers)
response.raise_for_status()
# Handle encoding issues common with Spanish data sources
try:
response_data = response.json()
except UnicodeDecodeError:
logger.warning("UTF-8 decode failed, trying alternative encodings", url=url)
# Try common Spanish encodings
for encoding in ['latin-1', 'windows-1252', 'iso-8859-1']:
try:
text_content = response.content.decode(encoding)
import json
response_data = json.loads(text_content)
logger.info("Successfully decoded with encoding", encoding=encoding)
break
except (UnicodeDecodeError, json.JSONDecodeError):
continue
else:
logger.error("Failed to decode response with any encoding", url=url)
return None
logger.debug("Direct URL response received",
status_code=response.status_code,
data_type=type(response_data),
data_length=len(response_data) if isinstance(response_data, (list, dict)) else "unknown")
return response_data
except httpx.HTTPStatusError as e:
logger.error("HTTP error in direct fetch", status_code=e.response.status_code, url=url)
return None
except httpx.RequestError as e:
logger.error("Request error in direct fetch", error=str(e), url=url)
return None
except Exception as e:
logger.error("Unexpected error in direct fetch", error=str(e), url=url)
return None
"""Fetch data directly from a full URL (for AEMET datos URLs) with retry logic"""
request_headers = headers or {}
logger.debug("Making direct URL request", url=url)
# Retry logic for unstable AEMET datos URLs
for attempt in range(self.retries):
try:
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.get(url, headers=request_headers)
response.raise_for_status()
# Handle encoding issues common with Spanish data sources
try:
response_data = response.json()
except UnicodeDecodeError:
logger.warning("UTF-8 decode failed, trying alternative encodings", url=url)
# Try common Spanish encodings
for encoding in ['latin-1', 'windows-1252', 'iso-8859-1']:
try:
text_content = response.content.decode(encoding)
import json
response_data = json.loads(text_content)
logger.info("Successfully decoded with encoding", encoding=encoding)
break
except (UnicodeDecodeError, json.JSONDecodeError):
continue
else:
logger.error("Failed to decode response with any encoding", url=url)
if attempt < self.retries - 1:
continue
return None
logger.debug("Direct URL response received",
status_code=response.status_code,
data_type=type(response_data),
data_length=len(response_data) if isinstance(response_data, (list, dict)) else "unknown")
return response_data
except httpx.HTTPStatusError as e:
logger.error("HTTP error in direct fetch",
status_code=e.response.status_code,
url=url,
attempt=attempt + 1)
# On last attempt, return None
if attempt == self.retries - 1:
return None
# Wait before retry
import asyncio
wait_time = 2 ** attempt # 1s, 2s, 4s
logger.info(f"Retrying datos URL in {wait_time}s",
attempt=attempt + 1, max_attempts=self.retries)
await asyncio.sleep(wait_time)
except httpx.RequestError as e:
logger.error("Request error in direct fetch",
error=str(e), url=url, attempt=attempt + 1)
# On last attempt, return None
if attempt == self.retries - 1:
return None
# Wait before retry
import asyncio
wait_time = 2 ** attempt # 1s, 2s, 4s
logger.info(f"Retrying datos URL in {wait_time}s",
attempt=attempt + 1, max_attempts=self.retries)
await asyncio.sleep(wait_time)
except Exception as e:
logger.error("Unexpected error in direct fetch",
error=str(e), url=url, attempt=attempt + 1)
# On last attempt, return None
if attempt == self.retries - 1:
return None
return None
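Both retry paths added in this file share one shape: try, log, back off, try again, and give up after self.retries attempts, with a longer schedule when the server is rate limiting (5s/15s/45s) than for ordinary transient failures (1s/2s/4s). A compact sketch of that pattern as a hypothetical standalone helper rather than the class methods above; the fetch callable and rate_limited predicate are stand-ins:

import asyncio
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")

async def with_backoff(fetch: Callable[[], Awaitable[T]],
                       retries: int = 3,
                       rate_limited: Callable[[Exception], bool] = lambda exc: False) -> Optional[T]:
    """Retry a coroutine with exponential backoff; wait longer on rate limits."""
    for attempt in range(retries):
        try:
            return await fetch()
        except Exception as exc:
            if attempt == retries - 1:
                return None
            # 5s/15s/45s when the caller flags a 429, otherwise 1s/2s/4s
            wait = 5 * (3 ** attempt) if rate_limited(exc) else 2 ** attempt
            await asyncio.sleep(wait)
    return None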
async def _post(self, endpoint: str, data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
"""Make POST request"""