REFACTOR external service and improve websocket training
@@ -318,49 +318,86 @@ class MadridTrafficClient(BaseTrafficClient, BaseAPIClient):
 
     async def _process_historical_zip_enhanced(self, zip_content: bytes, zip_url: str,
                                                 latitude: float, longitude: float,
                                                 nearest_points: List[Tuple[str, Dict[str, Any], float]]) -> List[Dict[str, Any]]:
-        """Process historical ZIP file with enhanced parsing"""
+        """Process historical ZIP file with memory-efficient streaming"""
         try:
             import zipfile
             import io
             import csv
             import gc
 
             historical_records = []
             nearest_ids = {p[0] for p in nearest_points}
 
             with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
                 csv_files = [f for f in zip_file.namelist() if f.lower().endswith('.csv')]
 
                 for csv_filename in csv_files:
                     try:
-                        # Read CSV content
+                        # Stream CSV file line-by-line to avoid loading entire file into memory
                         with zip_file.open(csv_filename) as csv_file:
-                            text_content = csv_file.read().decode('utf-8', errors='ignore')
-
-                            # Process CSV in chunks using processor
-                            csv_records = await self.processor.process_csv_content_chunked(
-                                text_content, csv_filename, nearest_ids, nearest_points
-                            )
-
-                            historical_records.extend(csv_records)
-
-                            # Force garbage collection
+                            # Use TextIOWrapper for efficient line-by-line reading
+                            import codecs
+                            text_wrapper = codecs.iterdecode(csv_file, 'utf-8', errors='ignore')
+                            csv_reader = csv.DictReader(text_wrapper, delimiter=';')
+
+                            # Process in small batches
+                            batch_size = 5000
+                            batch_records = []
+                            row_count = 0
+
+                            for row in csv_reader:
+                                row_count += 1
+                                measurement_point_id = row.get('id', '').strip()
+
+                                # Skip rows we don't need
+                                if measurement_point_id not in nearest_ids:
+                                    continue
+
+                                try:
+                                    record_data = await self.processor.parse_historical_csv_row(row, nearest_points)
+                                    if record_data:
+                                        batch_records.append(record_data)
+
+                                        # Store and clear batch when full
+                                        if len(batch_records) >= batch_size:
+                                            historical_records.extend(batch_records)
+                                            batch_records = []
+                                            gc.collect()
+
+                                except Exception:
+                                    continue
+
+                            # Store remaining records
+                            if batch_records:
+                                historical_records.extend(batch_records)
+                                batch_records = []
+
+                            self.logger.info("CSV file processed",
+                                             filename=csv_filename,
+                                             rows_scanned=row_count,
+                                             records_extracted=len(historical_records))
+
+                            # Aggressive garbage collection after each CSV
                             gc.collect()
 
                     except Exception as csv_error:
                         self.logger.warning("Error processing CSV file",
                                             filename=csv_filename,
                                             error=str(csv_error))
                         continue
 
             self.logger.info("Historical ZIP processing completed",
                              zip_url=zip_url,
                              total_records=len(historical_records))
 
+            # Final cleanup
+            del zip_content
+            gc.collect()
+
             return historical_records
 
         except Exception as e:
             self.logger.error("Error processing historical ZIP file",
                               zip_url=zip_url, error=str(e))
             return []
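
Below is a minimal, self-contained sketch of the streaming pattern the new code relies on: each CSV member of the ZIP is decoded incrementally with codecs.iterdecode and fed to csv.DictReader, so only one small chunk is held in memory at a time. The helper name stream_zip_csv_rows, the wanted_ids filter, and the sample data are illustrative assumptions, not part of the commit; the real method additionally batches matched rows and calls gc.collect() between flushes.

    # Sketch only: illustrates the streaming read, not the project's actual API.
    import codecs
    import csv
    import io
    import zipfile
    from typing import Dict, Iterator, Set


    def stream_zip_csv_rows(zip_content: bytes, wanted_ids: Set[str]) -> Iterator[Dict[str, str]]:
        """Yield matching rows from every CSV inside a ZIP without loading whole files."""
        with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
            for name in zip_file.namelist():
                if not name.lower().endswith('.csv'):
                    continue
                with zip_file.open(name) as csv_file:
                    # iterdecode wraps the binary member with an incremental decoder,
                    # so lines are decoded one at a time instead of read().decode().
                    text_stream = codecs.iterdecode(csv_file, 'utf-8', errors='ignore')
                    for row in csv.DictReader(text_stream, delimiter=';'):
                        # Filter early so unwanted rows are never accumulated.
                        if row.get('id', '').strip() in wanted_ids:
                            yield row


    if __name__ == '__main__':
        # Usage: build a tiny in-memory ZIP and stream only the ids we care about.
        # The 'PM001'/'PM002' ids and 'intensity' column are made-up sample data.
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, 'w') as zf:
            zf.writestr('sample.csv', 'id;intensity\nPM001;42\nPM002;17\n')
        for row in stream_zip_csv_rows(buf.getvalue(), {'PM001'}):
            print(row)  # {'id': 'PM001', 'intensity': '42'}

The design trade-off is that rows are parsed one at a time rather than after a full read().decode(), which keeps peak memory roughly constant per file at the cost of slightly more per-row overhead.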