REFACTOR external service and improve websocket training
@@ -318,49 +318,86 @@ class MadridTrafficClient(BaseTrafficClient, BaseAPIClient):
 
     async def _process_historical_zip_enhanced(self, zip_content: bytes, zip_url: str,
                                                 latitude: float, longitude: float,
                                                 nearest_points: List[Tuple[str, Dict[str, Any], float]]) -> List[Dict[str, Any]]:
-        """Process historical ZIP file with enhanced parsing"""
+        """Process historical ZIP file with memory-efficient streaming"""
         try:
             import zipfile
             import io
             import csv
             import gc
 
             historical_records = []
             nearest_ids = {p[0] for p in nearest_points}
 
             with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
                 csv_files = [f for f in zip_file.namelist() if f.lower().endswith('.csv')]
 
                 for csv_filename in csv_files:
                     try:
-                        # Read CSV content
+                        # Stream CSV file line-by-line to avoid loading entire file into memory
                         with zip_file.open(csv_filename) as csv_file:
-                            text_content = csv_file.read().decode('utf-8', errors='ignore')
-
-                            # Process CSV in chunks using processor
-                            csv_records = await self.processor.process_csv_content_chunked(
-                                text_content, csv_filename, nearest_ids, nearest_points
-                            )
-
-                            historical_records.extend(csv_records)
-
-                            # Force garbage collection
+                            # Use TextIOWrapper for efficient line-by-line reading
+                            import codecs
+                            text_wrapper = codecs.iterdecode(csv_file, 'utf-8', errors='ignore')
+                            csv_reader = csv.DictReader(text_wrapper, delimiter=';')
+
+                            # Process in small batches
+                            batch_size = 5000
+                            batch_records = []
+                            row_count = 0
+
+                            for row in csv_reader:
+                                row_count += 1
+                                measurement_point_id = row.get('id', '').strip()
+
+                                # Skip rows we don't need
+                                if measurement_point_id not in nearest_ids:
+                                    continue
+
+                                try:
+                                    record_data = await self.processor.parse_historical_csv_row(row, nearest_points)
+                                    if record_data:
+                                        batch_records.append(record_data)
+
+                                        # Store and clear batch when full
+                                        if len(batch_records) >= batch_size:
+                                            historical_records.extend(batch_records)
+                                            batch_records = []
+                                            gc.collect()
+
+                                except Exception:
+                                    continue
+
+                            # Store remaining records
+                            if batch_records:
+                                historical_records.extend(batch_records)
+                                batch_records = []
+
+                            self.logger.info("CSV file processed",
+                                             filename=csv_filename,
+                                             rows_scanned=row_count,
+                                             records_extracted=len(historical_records))
+
+                            # Aggressive garbage collection after each CSV
                             gc.collect()
 
                     except Exception as csv_error:
                         self.logger.warning("Error processing CSV file",
                                             filename=csv_filename,
                                             error=str(csv_error))
                         continue
 
             self.logger.info("Historical ZIP processing completed",
                              zip_url=zip_url,
                              total_records=len(historical_records))
 
+            # Final cleanup
+            del zip_content
+            gc.collect()
+
             return historical_records
 
         except Exception as e:
             self.logger.error("Error processing historical ZIP file",
                               zip_url=zip_url, error=str(e))
             return []
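
Below is a minimal, self-contained sketch of the streaming pattern the new code relies on: each CSV member of the ZIP is decoded incrementally with codecs.iterdecode and fed to csv.DictReader, so only one small chunk is held in memory at a time. The helper name stream_zip_csv_rows, the wanted_ids filter, and the sample data are illustrative assumptions, not part of the commit; the real method additionally batches matched rows and calls gc.collect() between flushes.

    # Sketch only: illustrates the streaming read, not the project's actual API.
    import codecs
    import csv
    import io
    import zipfile
    from typing import Dict, Iterator, Set


    def stream_zip_csv_rows(zip_content: bytes, wanted_ids: Set[str]) -> Iterator[Dict[str, str]]:
        """Yield matching rows from every CSV inside a ZIP without loading whole files."""
        with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
            for name in zip_file.namelist():
                if not name.lower().endswith('.csv'):
                    continue
                with zip_file.open(name) as csv_file:
                    # iterdecode wraps the binary member with an incremental decoder,
                    # so lines are decoded one at a time instead of read().decode().
                    text_stream = codecs.iterdecode(csv_file, 'utf-8', errors='ignore')
                    for row in csv.DictReader(text_stream, delimiter=';'):
                        # Filter early so unwanted rows are never accumulated.
                        if row.get('id', '').strip() in wanted_ids:
                            yield row


    if __name__ == '__main__':
        # Usage: build a tiny in-memory ZIP and stream only the ids we care about.
        # The 'PM001'/'PM002' ids and 'intensity' column are made-up sample data.
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, 'w') as zf:
            zf.writestr('sample.csv', 'id;intensity\nPM001;42\nPM002;17\n')
        for row in stream_zip_csv_rows(buf.getvalue(), {'PM001'}):
            print(row)  # {'id': 'PM001', 'intensity': '42'}

The design trade-off is that rows are parsed one at a time rather than after a full read().decode(), which keeps peak memory roughly constant per file at the cost of slightly more per-row overhead.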