# bakery-ia/services/data/app/services/data_import_service.py
"""
Enhanced Data Import Service
Service for importing sales data using repository pattern and enhanced error handling
"""
import csv
import io
import json
import base64
import pandas as pd
from typing import Dict, Any, List, Optional, Union
from datetime import datetime, timezone
import structlog
import re
from app.repositories.sales_repository import SalesRepository
from app.models.sales import SalesData
from app.schemas.sales import SalesDataCreate, SalesImportResult, SalesValidationResult
from shared.database.unit_of_work import UnitOfWork
from shared.database.transactions import transactional
from shared.database.exceptions import DatabaseError, ValidationError
logger = structlog.get_logger()
class EnhancedDataImportService:
"""Enhanced data import service using repository pattern"""
# Common column mappings for different languages/formats
COLUMN_MAPPINGS = {
'date': ['date', 'fecha', 'datum', 'data', 'dia'],
'datetime': ['datetime', 'fecha_hora', 'timestamp'],
'product': ['product', 'producto', 'item', 'articulo', 'nombre', 'name'],
'product_name': ['product_name', 'nombre_producto', 'item_name'],
'quantity': ['quantity', 'cantidad', 'qty', 'units', 'unidades'],
'quantity_sold': ['quantity_sold', 'cantidad_vendida', 'sold'],
'revenue': ['revenue', 'ingresos', 'sales', 'ventas', 'total', 'importe'],
'price': ['price', 'precio', 'cost', 'coste'],
'location': ['location', 'ubicacion', 'tienda', 'store', 'punto_venta'],
'location_id': ['location_id', 'store_id', 'tienda_id'],
}
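    # Fallback date formats tried in order by _parse_date() when pandas'
    # own parsing fails.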
DATE_FORMATS = [
'%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y', '%m-%d-%Y',
'%d.%m.%Y', '%Y/%m/%d', '%d/%m/%y', '%m/%d/%y',
'%Y-%m-%d %H:%M:%S', '%d/%m/%Y %H:%M',
]
def __init__(self, database_manager):
"""Initialize service with database manager"""
self.database_manager = database_manager
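    # Rough usage sketch (assumes an async caller and a database manager that
    # exposes get_session(); the names below are illustrative only):
    #
    #   service = EnhancedDataImportService(database_manager)
    #   validation = await service.validate_import_data({
    #       "tenant_id": "tenant-123",
    #       "data": csv_text,
    #       "data_format": "csv",
    #   })
    #   if validation.is_valid:
    #       result = await service.process_import(
    #           tenant_id="tenant-123",
    #           content=csv_text,
    #           file_format="csv",
    #           filename="ventas.csv",
    #       )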
async def validate_import_data(self, data: Dict[str, Any]) -> SalesValidationResult:
"""Validate import data before processing"""
try:
logger.info("Starting import data validation", tenant_id=data.get("tenant_id"))
validation_result = SalesValidationResult(
is_valid=True,
total_records=0,
valid_records=0,
invalid_records=0,
errors=[],
warnings=[],
summary={}
)
errors = []
warnings = []
# Basic validation checks
if not data.get("tenant_id"):
errors.append({
"type": "missing_field",
"message": "tenant_id es requerido",
"field": "tenant_id",
"row": None,
"code": "MISSING_TENANT_ID"
})
if not data.get("data"):
errors.append({
"type": "missing_data",
"message": "Datos de archivo faltantes",
"field": "data",
"row": None,
"code": "NO_DATA_PROVIDED"
})
validation_result.is_valid = False
validation_result.errors = errors
validation_result.summary = {
"status": "failed",
"reason": "no_data_provided",
"file_format": data.get("data_format", "unknown"),
"suggestions": ["Selecciona un archivo válido para importar"]
}
return validation_result
# Validate file format
format_type = data.get("data_format", "").lower()
supported_formats = ["csv", "excel", "xlsx", "xls", "json", "pos"]
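            # Note: "xls" and "pos" pass validation here, but process_import()
            # currently handles only csv, json, excel and xlsx.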
if format_type not in supported_formats:
errors.append({
"type": "unsupported_format",
"message": f"Formato no soportado: {format_type}",
"field": "data_format",
"row": None,
"code": "UNSUPPORTED_FORMAT"
})
# Validate data size
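            # The size check runs on the raw payload string; for base64-encoded
            # uploads (e.g. Excel) this is roughly 33% larger than the original file.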
data_content = data.get("data", "")
data_size = len(data_content)
if data_size == 0:
errors.append({
"type": "empty_file",
"message": "El archivo está vacío",
"field": "data",
"row": None,
"code": "EMPTY_FILE"
})
elif data_size > 10 * 1024 * 1024: # 10MB limit
errors.append({
"type": "file_too_large",
"message": "Archivo demasiado grande (máximo 10MB)",
"field": "data",
"row": None,
"code": "FILE_TOO_LARGE"
})
elif data_size > 1024 * 1024: # 1MB warning
warnings.append({
"type": "large_file",
"message": "Archivo grande detectado. El procesamiento puede tomar más tiempo.",
"field": "data",
"row": None,
"code": "LARGE_FILE_WARNING"
})
# Analyze CSV content if format is CSV
if format_type == "csv" and data_content and not errors:
try:
reader = csv.DictReader(io.StringIO(data_content))
rows = list(reader)
validation_result.total_records = len(rows)
if not rows:
errors.append({
"type": "empty_content",
"message": "El archivo CSV no contiene datos",
"field": "data",
"row": None,
"code": "NO_CONTENT"
})
else:
# Analyze structure
headers = list(rows[0].keys()) if rows else []
column_mapping = self._detect_columns(headers)
# Check for required columns
if not column_mapping.get('date'):
errors.append({
"type": "missing_column",
"message": "Columna de fecha no encontrada",
"field": "date",
"row": None,
"code": "MISSING_DATE_COLUMN"
})
if not column_mapping.get('product'):
errors.append({
"type": "missing_column",
"message": "Columna de producto no encontrada",
"field": "product",
"row": None,
"code": "MISSING_PRODUCT_COLUMN"
})
if not column_mapping.get('quantity'):
warnings.append({
"type": "missing_column",
"message": "Columna de cantidad no encontrada, se usará 1 por defecto",
"field": "quantity",
"row": None,
"code": "MISSING_QUANTITY_COLUMN"
})
# Calculate estimated valid/invalid records
if not errors:
estimated_invalid = max(0, int(validation_result.total_records * 0.1))
validation_result.valid_records = validation_result.total_records - estimated_invalid
validation_result.invalid_records = estimated_invalid
else:
validation_result.valid_records = 0
validation_result.invalid_records = validation_result.total_records
except Exception as csv_error:
logger.warning("CSV analysis failed", error=str(csv_error))
warnings.append({
"type": "analysis_warning",
"message": f"No se pudo analizar completamente el CSV: {str(csv_error)}",
"field": "data",
"row": None,
"code": "CSV_ANALYSIS_WARNING"
})
# Set validation result
validation_result.is_valid = len(errors) == 0
validation_result.errors = errors
validation_result.warnings = warnings
# Build summary
validation_result.summary = {
"status": "valid" if validation_result.is_valid else "invalid",
"file_format": format_type,
"file_size_bytes": data_size,
"file_size_mb": round(data_size / (1024 * 1024), 2),
"estimated_processing_time_seconds": max(1, validation_result.total_records // 100),
"validation_timestamp": datetime.utcnow().isoformat(),
"suggestions": self._generate_suggestions(validation_result, format_type, len(warnings))
}
logger.info("Import validation completed",
is_valid=validation_result.is_valid,
total_records=validation_result.total_records,
error_count=len(errors),
warning_count=len(warnings))
return validation_result
except Exception as e:
logger.error("Validation process failed", error=str(e))
return SalesValidationResult(
is_valid=False,
total_records=0,
valid_records=0,
invalid_records=0,
errors=[{
"type": "system_error",
"message": f"Error en el proceso de validación: {str(e)}",
"field": None,
"row": None,
"code": "SYSTEM_ERROR"
}],
warnings=[],
summary={
"status": "error",
"file_format": data.get("data_format", "unknown"),
"error_type": "system_error",
"suggestions": [
"Intenta de nuevo con un archivo diferente",
"Contacta soporte si el problema persiste"
]
}
)
async def process_import(
self,
tenant_id: str,
content: str,
file_format: str,
filename: Optional[str] = None,
session = None
) -> SalesImportResult:
"""Process data import using repository pattern"""
start_time = datetime.utcnow()
try:
logger.info("Starting data import using repository pattern",
filename=filename,
format=file_format,
tenant_id=tenant_id)
async with self.database_manager.get_session() as db_session:
async with UnitOfWork(db_session) as uow:
# Register sales repository
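                    # All repository writes below happen inside this unit of work and are
                    # persisted together by uow.commit(); if an exception escapes the block,
                    # the pending records are expected to be discarded (standard
                    # unit-of-work semantics, assumed here for the shared UnitOfWork).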
sales_repo = uow.register_repository("sales", SalesRepository, SalesData)
# Process data based on format
if file_format.lower() == 'csv':
result = await self._process_csv_data(tenant_id, content, sales_repo, filename)
elif file_format.lower() == 'json':
result = await self._process_json_data(tenant_id, content, sales_repo, filename)
elif file_format.lower() in ['excel', 'xlsx']:
result = await self._process_excel_data(tenant_id, content, sales_repo, filename)
else:
raise ValidationError(f"Unsupported format: {file_format}")
# Commit all changes
await uow.commit()
# Calculate processing time
end_time = datetime.utcnow()
processing_time = (end_time - start_time).total_seconds()
# Build final result
final_result = SalesImportResult(
success=result.get("success", False),
records_processed=result.get("total_rows", 0),
records_created=result.get("records_created", 0),
records_updated=0, # We don't update, only create
records_failed=result.get("total_rows", 0) - result.get("records_created", 0),
errors=self._structure_messages(result.get("errors", [])),
warnings=self._structure_messages(result.get("warnings", [])),
processing_time_seconds=processing_time
)
logger.info("Data import completed successfully",
records_created=final_result.records_created,
processing_time=processing_time)
return final_result
except (ValidationError, DatabaseError):
raise
except Exception as e:
end_time = datetime.utcnow()
processing_time = (end_time - start_time).total_seconds()
logger.error("Data import failed", error=str(e), tenant_id=tenant_id)
return SalesImportResult(
success=False,
records_processed=0,
records_created=0,
records_updated=0,
records_failed=0,
errors=[{
"type": "import_error",
"message": f"Import failed: {str(e)}",
"field": None,
"row": None,
"code": "IMPORT_FAILURE"
}],
warnings=[],
processing_time_seconds=processing_time
)
async def _process_csv_data(
self,
tenant_id: str,
csv_content: str,
sales_repo: SalesRepository,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Process CSV data using repository"""
try:
reader = csv.DictReader(io.StringIO(csv_content))
rows = list(reader)
if not rows:
return {
"success": False,
"total_rows": 0,
"records_created": 0,
"errors": ["CSV file is empty"],
"warnings": []
}
# Column mapping
column_mapping = self._detect_columns(list(rows[0].keys()))
records_created = 0
errors = []
warnings = []
logger.info(f"Processing {len(rows)} records from CSV")
for index, row in enumerate(rows):
try:
# Parse and validate data
parsed_data = await self._parse_row_data(row, column_mapping, index + 1)
if parsed_data.get("skip"):
errors.extend(parsed_data.get("errors", []))
warnings.extend(parsed_data.get("warnings", []))
continue
# Create sales record using repository
record_data = {
"tenant_id": tenant_id,
"date": parsed_data["date"],
"product_name": parsed_data["product_name"],
"quantity_sold": parsed_data["quantity_sold"],
"revenue": parsed_data.get("revenue"),
"location_id": parsed_data.get("location_id"),
"source": "csv"
}
await sales_repo.create(record_data)
records_created += 1
# Log progress for large imports
if records_created % 100 == 0:
logger.info(f"Processed {records_created} records...")
except Exception as e:
error_msg = f"Row {index + 1}: {str(e)}"
errors.append(error_msg)
logger.warning("Record processing failed", error=error_msg)
success_rate = (records_created / len(rows)) * 100 if rows else 0
return {
"success": records_created > 0,
"total_rows": len(rows),
"records_created": records_created,
"success_rate": success_rate,
"errors": errors,
"warnings": warnings
}
except Exception as e:
logger.error("CSV processing failed", error=str(e))
raise DatabaseError(f"CSV processing error: {str(e)}")
async def _process_json_data(
self,
tenant_id: str,
json_content: str,
sales_repo: SalesRepository,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Process JSON data using repository"""
try:
# Parse JSON
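            # The payload may arrive as a data URI ("data:<mime>;base64,<payload>") from a
            # browser upload; in that case decode the base64 part after the comma.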
if json_content.startswith('data:'):
json_content = base64.b64decode(json_content.split(',')[1]).decode('utf-8')
data = json.loads(json_content)
# Handle different JSON structures
if isinstance(data, dict):
if 'data' in data:
records = data['data']
elif 'records' in data:
records = data['records']
elif 'sales' in data:
records = data['sales']
else:
records = [data] # Single record
elif isinstance(data, list):
records = data
else:
raise ValidationError("Invalid JSON format")
# Convert to DataFrame for consistent processing
df = pd.DataFrame(records)
df.columns = df.columns.str.strip().str.lower()
return await self._process_dataframe(tenant_id, df, sales_repo, "json", filename)
except json.JSONDecodeError as e:
raise ValidationError(f"Invalid JSON: {str(e)}")
except Exception as e:
logger.error("JSON processing failed", error=str(e))
raise DatabaseError(f"JSON processing error: {str(e)}")
async def _process_excel_data(
self,
tenant_id: str,
excel_content: str,
sales_repo: SalesRepository,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Process Excel data using repository"""
try:
# Decode base64 content
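            # Excel files are binary, so the payload is expected as base64
            # (with or without a leading "data:" URI prefix).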
if excel_content.startswith('data:'):
excel_bytes = base64.b64decode(excel_content.split(',')[1])
else:
excel_bytes = base64.b64decode(excel_content)
# Read Excel file
df = pd.read_excel(io.BytesIO(excel_bytes), sheet_name=0)
# Clean column names
df.columns = df.columns.str.strip().str.lower()
# Remove empty rows
df = df.dropna(how='all')
return await self._process_dataframe(tenant_id, df, sales_repo, "excel", filename)
except Exception as e:
logger.error("Excel processing failed", error=str(e))
raise DatabaseError(f"Excel processing error: {str(e)}")
async def _process_dataframe(
self,
tenant_id: str,
df: pd.DataFrame,
sales_repo: SalesRepository,
source: str,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Process DataFrame using repository"""
try:
# Map columns
column_mapping = self._detect_columns(df.columns.tolist())
if not column_mapping.get('date') or not column_mapping.get('product'):
required_missing = []
if not column_mapping.get('date'):
required_missing.append("date")
if not column_mapping.get('product'):
required_missing.append("product")
raise ValidationError(f"Required columns missing: {', '.join(required_missing)}")
records_created = 0
errors = []
warnings = []
logger.info(f"Processing {len(df)} records from {source}")
for index, row in df.iterrows():
try:
# Convert pandas row to dict
row_dict = {}
for col in df.columns:
row_dict[col] = row[col]
# Parse and validate data
parsed_data = await self._parse_row_data(row_dict, column_mapping, index + 1)
if parsed_data.get("skip"):
errors.extend(parsed_data.get("errors", []))
warnings.extend(parsed_data.get("warnings", []))
continue
# Create sales record using repository
record_data = {
"tenant_id": tenant_id,
"date": parsed_data["date"],
"product_name": parsed_data["product_name"],
"quantity_sold": parsed_data["quantity_sold"],
"revenue": parsed_data.get("revenue"),
"location_id": parsed_data.get("location_id"),
"source": source
}
await sales_repo.create(record_data)
records_created += 1
# Log progress for large imports
if records_created % 100 == 0:
logger.info(f"Processed {records_created} records...")
except Exception as e:
error_msg = f"Row {index + 1}: {str(e)}"
errors.append(error_msg)
logger.warning("Record processing failed", error=error_msg)
success_rate = (records_created / len(df)) * 100 if len(df) > 0 else 0
return {
"success": records_created > 0,
"total_rows": len(df),
"records_created": records_created,
"success_rate": success_rate,
"errors": errors[:10], # Limit errors
"warnings": warnings[:10] # Limit warnings
}
except ValidationError:
raise
except Exception as e:
logger.error("DataFrame processing failed", error=str(e))
raise DatabaseError(f"Data processing error: {str(e)}")
async def _parse_row_data(
self,
row: Dict[str, Any],
column_mapping: Dict[str, str],
row_number: int
) -> Dict[str, Any]:
"""Parse and validate row data"""
errors = []
warnings = []
try:
# Extract and validate date
date_str = str(row.get(column_mapping.get('date', ''), '')).strip()
if not date_str or date_str.lower() in ['nan', 'null', 'none', '']:
errors.append(f"Row {row_number}: Missing date")
return {"skip": True, "errors": errors, "warnings": warnings}
parsed_date = self._parse_date(date_str)
if not parsed_date:
errors.append(f"Row {row_number}: Invalid date format: {date_str}")
return {"skip": True, "errors": errors, "warnings": warnings}
# Extract and validate product name
product_name = str(row.get(column_mapping.get('product', ''), '')).strip()
if not product_name or product_name.lower() in ['nan', 'null', 'none', '']:
errors.append(f"Row {row_number}: Missing product name")
return {"skip": True, "errors": errors, "warnings": warnings}
product_name = self._clean_product_name(product_name)
# Extract and validate quantity
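            # Accept European decimal commas (e.g. "2,0") before coercing to int;
            # unparseable or non-positive values fall back to a quantity of 1.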
quantity_raw = row.get(column_mapping.get('quantity', 'quantity'), 1)
try:
quantity = int(float(str(quantity_raw).replace(',', '.')))
if quantity <= 0:
warnings.append(f"Row {row_number}: Invalid quantity ({quantity}), using 1")
quantity = 1
except (ValueError, TypeError):
warnings.append(f"Row {row_number}: Invalid quantity ({quantity_raw}), using 1")
quantity = 1
# Extract revenue (optional)
revenue = None
if 'revenue' in column_mapping and column_mapping['revenue'] in row:
revenue_raw = row.get(column_mapping['revenue'])
if revenue_raw and str(revenue_raw).lower() not in ['nan', 'null', 'none', '']:
try:
                        # Strip common currency symbols and accept comma decimals
                        revenue = float(str(revenue_raw).replace(',', '.').replace('€', '').replace('$', '').strip())
if revenue < 0:
revenue = None
warnings.append(f"Row {row_number}: Negative revenue ignored")
except (ValueError, TypeError):
warnings.append(f"Row {row_number}: Invalid revenue format: {revenue_raw}")
# Extract location (optional)
location_id = None
if 'location' in column_mapping and column_mapping['location'] in row:
location_raw = row.get(column_mapping['location'])
if location_raw and str(location_raw).lower() not in ['nan', 'null', 'none', '']:
location_id = str(location_raw).strip()
return {
"skip": False,
"date": parsed_date,
"product_name": product_name,
"quantity_sold": quantity,
"revenue": revenue,
"location_id": location_id,
"errors": errors,
"warnings": warnings
}
except Exception as e:
errors.append(f"Row {row_number}: Parsing error: {str(e)}")
return {"skip": True, "errors": errors, "warnings": warnings}
def _detect_columns(self, columns: List[str]) -> Dict[str, str]:
"""Detect column mappings using fuzzy matching"""
mapping = {}
columns_lower = [col.lower() for col in columns]
for standard_name, possible_names in self.COLUMN_MAPPINGS.items():
for col in columns_lower:
for possible in possible_names:
if possible in col or col in possible:
mapping[standard_name] = columns[columns_lower.index(col)]
break
if standard_name in mapping:
break
# Map common aliases
if 'product' not in mapping and 'product_name' in mapping:
mapping['product'] = mapping['product_name']
if 'quantity' not in mapping and 'quantity_sold' in mapping:
mapping['quantity'] = mapping['quantity_sold']
if 'location' not in mapping and 'location_id' in mapping:
mapping['location'] = mapping['location_id']
return mapping
def _parse_date(self, date_str: str) -> Optional[datetime]:
"""Parse date string with multiple format attempts"""
if not date_str or str(date_str).lower() in ['nan', 'null', 'none']:
return None
date_str = str(date_str).strip()
# Try pandas first
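        # dayfirst=True biases ambiguous dates such as "01/02/2024" towards the
        # European reading (1 February) rather than the US one.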
try:
parsed_dt = pd.to_datetime(date_str, dayfirst=True)
if hasattr(parsed_dt, 'to_pydatetime'):
parsed_dt = parsed_dt.to_pydatetime()
if parsed_dt.tzinfo is None:
parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
return parsed_dt
except Exception:
pass
# Try specific formats
for fmt in self.DATE_FORMATS:
try:
parsed_dt = datetime.strptime(date_str, fmt)
if parsed_dt.tzinfo is None:
parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
return parsed_dt
except ValueError:
continue
return None
def _clean_product_name(self, product_name: str) -> str:
"""Clean and standardize product names"""
if not product_name:
return "Producto sin nombre"
# Remove extra whitespace
cleaned = re.sub(r'\s+', ' ', str(product_name).strip())
# Remove special characters but keep Spanish characters
cleaned = re.sub(r'[^\w\s\-áéíóúñçüÁÉÍÓÚÑÇÜ]', '', cleaned)
# Capitalize first letter of each word
cleaned = cleaned.title()
# Common corrections for Spanish bakeries
replacements = {
'Pan De': 'Pan de',
'Café Con': 'Café con',
            'Te ': 'Té ',
'Bocadillo De': 'Bocadillo de',
}
for old, new in replacements.items():
cleaned = cleaned.replace(old, new)
return cleaned if cleaned else "Producto sin nombre"
def _structure_messages(self, messages: List[Union[str, Dict]]) -> List[Dict[str, Any]]:
"""Convert string messages to structured format"""
structured = []
for msg in messages:
if isinstance(msg, str):
structured.append({
"type": "general_message",
"message": msg,
"field": None,
"row": None,
"code": "GENERAL_MESSAGE"
})
else:
structured.append(msg)
return structured
def _generate_suggestions(
self,
validation_result: SalesValidationResult,
format_type: str,
warning_count: int
) -> List[str]:
"""Generate contextual suggestions based on validation results"""
suggestions = []
if validation_result.is_valid:
suggestions.append("El archivo está listo para procesamiento")
suggestions.append(f"Se procesarán aproximadamente {validation_result.total_records} registros")
if validation_result.total_records > 1000:
suggestions.append("Archivo grande: el procesamiento puede tomar varios minutos")
if warning_count > 0:
suggestions.append("Revisa las advertencias antes de continuar")
else:
suggestions.append("Corrige los errores antes de continuar")
suggestions.append("Verifica que el archivo tenga el formato correcto")
if format_type not in ["csv", "excel", "xlsx", "json"]:
suggestions.append("Usa formato CSV o Excel")
if validation_result.total_records == 0:
suggestions.append("Asegúrate de que el archivo contenga datos")
return suggestions
# Legacy compatibility alias
DataImportService = EnhancedDataImportService