# services/sales/app/services/data_import_service.py
"""
Data Import Service
Service for importing sales data using the repository pattern and enhanced error handling
"""
import asyncio
import base64
import csv
import io
import json
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Union
from uuid import UUID

import pandas as pd
import structlog

from app.core.database import get_db_transaction
from app.models.sales import SalesData
from app.repositories.sales_repository import SalesRepository
from app.schemas.sales import SalesDataCreate
from app.services.inventory_client import InventoryServiceClient

logger = structlog.get_logger()


# Import result schemas (dataclass definitions)
@dataclass
class SalesValidationResult:
    is_valid: bool
    total_records: int
    valid_records: int
    invalid_records: int
    errors: List[Dict[str, Any]]
    warnings: List[Dict[str, Any]]
    summary: Dict[str, Any]


@dataclass
class SalesImportResult:
    success: bool
    records_processed: int
    records_created: int
    records_updated: int
    records_failed: int
    errors: List[Dict[str, Any]]
    warnings: List[Dict[str, Any]]
    processing_time_seconds: float


class DataImportService:
    """Enhanced data import service using the repository pattern, with STRICT validation for production"""

    # PRODUCTION VALIDATION CONFIGURATION
    STRICT_VALIDATION = True       # Set to False for lenient validation, True for production quality
    MAX_QUANTITY_PER_DAY = 10000   # Maximum reasonable quantity per product per day
    MAX_REVENUE_PER_ITEM = 100000  # Maximum reasonable revenue per line item
    MAX_UNIT_PRICE = 10000         # Maximum reasonable price per unit for bakery items
    # Common column mappings for different languages/formats
    COLUMN_MAPPINGS = {
        'date': ['date', 'fecha', 'datum', 'data', 'dia'],
        'datetime': ['datetime', 'fecha_hora', 'timestamp'],
        'product': ['product', 'producto', 'item', 'articulo', 'nombre', 'name'],
        'product_name': ['product_name', 'nombre_producto', 'item_name'],
        'quantity': ['quantity', 'cantidad', 'qty', 'units', 'unidades'],
        'quantity_sold': ['quantity_sold', 'cantidad_vendida', 'sold'],
        'revenue': ['revenue', 'ingresos', 'sales', 'ventas', 'total', 'importe'],
        'price': ['price', 'precio', 'cost', 'coste'],
        'location': ['location', 'ubicacion', 'tienda', 'store', 'punto_venta'],
        'location_id': ['location_id', 'store_id', 'tienda_id'],
        # Without a 'category' entry the category branch in _parse_row_data can never
        # match; the aliases below are an assumption about likely header names.
        'category': ['category', 'categoria', 'tipo'],
    }

    DATE_FORMATS = [
        '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y', '%m-%d-%Y',
        '%d.%m.%Y', '%Y/%m/%d', '%d/%m/%y', '%m/%d/%y',
        '%Y-%m-%d %H:%M:%S', '%d/%m/%Y %H:%M',
    ]
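
    # Illustrative sketch (not a repo fixture): a minimal Spanish CSV that the mappings
    # above recognise. Headers are matched case-insensitively, so "fecha" -> date,
    # "producto" -> product, "cantidad" -> quantity and "ingresos" -> revenue.
    #
    #   fecha,producto,cantidad,ingresos
    #   01/02/2025,Pan de Centeno,12,18.50
    #   01/02/2025,Café con Leche,30,45.00
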
def __init__(self):
"""Initialize enhanced import service"""
self.inventory_client = InventoryServiceClient()
# Product resolution cache for the import session
self.product_cache = {} # product_name -> inventory_product_id
self.failed_products = set() # Track products that failed to resolve
async def validate_import_data(self, data: Dict[str, Any]) -> SalesValidationResult:
"""Enhanced validation with better error handling and suggestions"""
try:
logger.info("Starting enhanced import data validation", tenant_id=data.get("tenant_id"))
validation_result = SalesValidationResult(
is_valid=True,
total_records=0,
valid_records=0,
invalid_records=0,
errors=[],
warnings=[],
summary={}
)
errors = []
warnings = []
# Basic validation checks
if not data.get("tenant_id"):
errors.append({
"type": "missing_field",
"message": "tenant_id es requerido",
"field": "tenant_id",
"row": None,
"code": "MISSING_TENANT_ID"
})
if not data.get("data"):
errors.append({
"type": "missing_data",
"message": "Datos de archivo faltantes",
"field": "data",
"row": None,
"code": "NO_DATA_PROVIDED"
})
validation_result.is_valid = False
validation_result.errors = errors
validation_result.summary = {
"status": "failed",
"reason": "no_data_provided",
"file_format": data.get("data_format", "unknown"),
"suggestions": ["Selecciona un archivo válido para importar"]
}
return validation_result
# Validate file format
format_type = data.get("data_format", "").lower()
supported_formats = ["csv", "excel", "xlsx", "xls", "json", "pos"]
if format_type not in supported_formats:
errors.append({
"type": "unsupported_format",
"message": f"Formato no soportado: {format_type}",
"field": "data_format",
"row": None,
"code": "UNSUPPORTED_FORMAT"
})
# Validate data size
data_content = data.get("data", "")
data_size = len(data_content)
if data_size == 0:
errors.append({
"type": "empty_file",
"message": "El archivo está vacío",
"field": "data",
"row": None,
"code": "EMPTY_FILE"
})
elif data_size > 10 * 1024 * 1024: # 10MB limit
errors.append({
"type": "file_too_large",
"message": "Archivo demasiado grande (máximo 10MB)",
"field": "data",
"row": None,
"code": "FILE_TOO_LARGE"
})
elif data_size > 1024 * 1024: # 1MB warning
warnings.append({
"type": "large_file",
"message": "Archivo grande detectado. El procesamiento puede tomar más tiempo.",
"field": "data",
"row": None,
"code": "LARGE_FILE_WARNING"
})
# Analyze CSV content if format is CSV
if format_type == "csv" and data_content and not errors:
try:
reader = csv.DictReader(io.StringIO(data_content))
rows = list(reader)
validation_result.total_records = len(rows)
if not rows:
errors.append({
"type": "empty_content",
"message": "El archivo CSV no contiene datos",
"field": "data",
"row": None,
"code": "NO_CONTENT"
})
else:
# Enhanced column analysis
headers = list(rows[0].keys()) if rows else []
column_mapping = self._detect_columns(headers)
# Check for required columns
if not column_mapping.get('date'):
errors.append({
"type": "missing_column",
"message": "Columna de fecha no encontrada",
"field": "date",
"row": None,
"code": "MISSING_DATE_COLUMN"
})
if not column_mapping.get('product'):
errors.append({
"type": "missing_column",
"message": "Columna de producto no encontrada",
"field": "product",
"row": None,
"code": "MISSING_PRODUCT_COLUMN"
})
if not column_mapping.get('quantity'):
warnings.append({
"type": "missing_column",
"message": "Columna de cantidad no encontrada, se usará 1 por defecto",
"field": "quantity",
"row": None,
"code": "MISSING_QUANTITY_COLUMN"
})
# Enhanced data quality estimation
if not errors:
sample_size = min(10, len(rows))
sample_rows = rows[:sample_size]
quality_issues = 0
for i, row in enumerate(sample_rows):
parsed_data = await self._parse_row_data(row, column_mapping, i + 1)
if parsed_data.get("skip") or parsed_data.get("errors"):
quality_issues += 1
estimated_error_rate = (quality_issues / sample_size) * 100 if sample_size > 0 else 0
estimated_invalid = int(validation_result.total_records * estimated_error_rate / 100)
validation_result.valid_records = validation_result.total_records - estimated_invalid
validation_result.invalid_records = estimated_invalid
# STRICT: Any data quality issues should fail validation for production
if estimated_error_rate > 0:
errors.append({
"type": "data_quality_error",
"message": f"Falló la validación de calidad: {estimated_error_rate:.0f}% de los datos tienen errores críticos",
"field": "data",
"row": None,
"code": "DATA_QUALITY_FAILED"
})
# Add specific error details
if estimated_error_rate > 50:
errors.append({
"type": "data_quality_critical",
"message": f"Calidad de datos crítica: más del 50% de los registros tienen errores",
"field": "data",
"row": None,
"code": "DATA_QUALITY_CRITICAL"
})
elif estimated_error_rate > 20:
errors.append({
"type": "data_quality_high",
"message": f"Alta tasa de errores detectada: {estimated_error_rate:.0f}% de los datos requieren corrección",
"field": "data",
"row": None,
"code": "DATA_QUALITY_HIGH_ERROR_RATE"
})
else:
# Even small error rates are now treated as validation failures
errors.append({
"type": "data_quality_detected",
"message": f"Se detectaron errores de validación en {estimated_error_rate:.0f}% de los datos",
"field": "data",
"row": None,
"code": "DATA_QUALITY_ERRORS_FOUND"
})
else:
validation_result.valid_records = 0
validation_result.invalid_records = validation_result.total_records
except Exception as csv_error:
logger.warning("Enhanced CSV analysis failed", error=str(csv_error))
warnings.append({
"type": "analysis_warning",
"message": f"No se pudo analizar completamente el CSV: {str(csv_error)}",
"field": "data",
"row": None,
"code": "CSV_ANALYSIS_WARNING"
})
# Set validation result
validation_result.is_valid = len(errors) == 0
validation_result.errors = errors
validation_result.warnings = warnings
# Enhanced summary generation
validation_result.summary = {
"status": "valid" if validation_result.is_valid else "invalid",
"file_format": format_type,
"file_size_bytes": data_size,
"file_size_mb": round(data_size / (1024 * 1024), 2),
"estimated_processing_time_seconds": max(1, validation_result.total_records // 100),
"validation_timestamp": datetime.utcnow().isoformat(),
"detected_columns": list(self._detect_columns(list(csv.DictReader(io.StringIO(data_content)).fieldnames or [])).keys()) if format_type == "csv" and data_content else [],
"suggestions": self._generate_suggestions(validation_result, format_type, len(warnings))
}
logger.info("Enhanced import validation completed",
is_valid=validation_result.is_valid,
total_records=validation_result.total_records,
error_count=len(errors),
warning_count=len(warnings))
return validation_result
except Exception as e:
logger.error("Enhanced validation process failed", error=str(e))
return SalesValidationResult(
is_valid=False,
total_records=0,
valid_records=0,
invalid_records=0,
errors=[{
"type": "system_error",
"message": f"Error en el proceso de validación: {str(e)}",
"field": None,
"row": None,
"code": "SYSTEM_ERROR"
}],
warnings=[],
summary={
"status": "error",
"file_format": data.get("data_format", "unknown"),
"error_type": "system_error",
"suggestions": [
"Intenta de nuevo con un archivo diferente",
"Contacta soporte si el problema persiste"
]
}
)
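
    # Usage sketch for validation (assumes an async caller; the tenant id and inline CSV
    # below are illustrative values, not fixtures from this repository):
    #
    #   service = DataImportService()
    #   validation = await service.validate_import_data({
    #       "tenant_id": "11111111-1111-1111-1111-111111111111",
    #       "data_format": "csv",
    #       "data": "fecha,producto,cantidad\n01/02/2025,Croissant,24\n",
    #   })
    #   if validation.is_valid:
    #       ...  # proceed to process_import
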
async def process_import(
self,
tenant_id: str,
content: str,
file_format: str,
filename: Optional[str] = None
) -> SalesImportResult:
"""Enhanced data import processing with better error handling"""
start_time = datetime.utcnow()
try:
# Clear cache for new import session
self._clear_import_cache()
logger.info("Starting enhanced data import",
filename=filename,
format=file_format,
tenant_id=tenant_id)
async with get_db_transaction() as db:
repository = SalesRepository(db)
# Process data based on format
if file_format.lower() == 'csv':
result = await self._process_csv_data(tenant_id, content, repository, filename)
elif file_format.lower() == 'json':
result = await self._process_json_data(tenant_id, content, repository, filename)
elif file_format.lower() in ['excel', 'xlsx']:
result = await self._process_excel_data(tenant_id, content, repository, filename)
else:
raise ValueError(f"Unsupported format: {file_format}")
# Calculate processing time
end_time = datetime.utcnow()
processing_time = (end_time - start_time).total_seconds()
# Build enhanced final result
final_result = SalesImportResult(
success=result.get("success", False),
records_processed=result.get("total_rows", 0),
records_created=result.get("records_created", 0),
records_updated=0, # We don't update, only create
records_failed=result.get("total_rows", 0) - result.get("records_created", 0),
errors=self._structure_messages(result.get("errors", [])),
warnings=self._structure_messages(result.get("warnings", [])),
processing_time_seconds=processing_time
)
logger.info("Enhanced data import completed successfully",
records_created=final_result.records_created,
processing_time=processing_time)
return final_result
except Exception as e:
end_time = datetime.utcnow()
processing_time = (end_time - start_time).total_seconds()
logger.error("Enhanced data import failed", error=str(e), tenant_id=tenant_id)
return SalesImportResult(
success=False,
records_processed=0,
records_created=0,
records_updated=0,
records_failed=0,
errors=[{
"type": "import_error",
"message": f"Import failed: {str(e)}",
"field": None,
"row": None,
"code": "IMPORT_FAILURE"
}],
warnings=[],
processing_time_seconds=processing_time
)
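
    # Usage sketch for a full import (assumes get_db_transaction() is configured and the
    # inventory service is reachable; csv_text and the tenant id are illustrative):
    #
    #   service = DataImportService()
    #   result = await service.process_import(
    #       tenant_id="11111111-1111-1111-1111-111111111111",
    #       content=csv_text,
    #       file_format="csv",
    #       filename="ventas_febrero.csv",
    #   )
    #   logger.info("import finished",
    #               created=result.records_created,
    #               failed=result.records_failed,
    #               seconds=result.processing_time_seconds)
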
async def _process_csv_data(
self,
tenant_id: str,
csv_content: str,
repository: SalesRepository,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Enhanced CSV processing with better data handling"""
try:
reader = csv.DictReader(io.StringIO(csv_content))
rows = list(reader)
if not rows:
return {
"success": False,
"total_rows": 0,
"records_created": 0,
"errors": ["CSV file is empty"],
"warnings": []
}
# Enhanced column mapping
column_mapping = self._detect_columns(list(rows[0].keys()))
records_created = 0
errors = []
warnings = []
logger.info(f"Processing {len(rows)} records from CSV with enhanced mapping")
for index, row in enumerate(rows):
try:
# Enhanced data parsing and validation
parsed_data = await self._parse_row_data(row, column_mapping, index + 1)
if parsed_data.get("skip"):
errors.extend(parsed_data.get("errors", []))
warnings.extend(parsed_data.get("warnings", []))
continue
# Resolve product name to inventory_product_id
inventory_product_id = await self._resolve_product_to_inventory_id(
parsed_data["product_name"],
parsed_data.get("product_category"),
tenant_id
)
if not inventory_product_id:
error_msg = f"Row {index + 1}: Could not resolve product '{parsed_data['product_name']}' to inventory ID"
errors.append(error_msg)
logger.warning("Product resolution failed", error=error_msg)
continue
# Create sales record with enhanced data
sales_data = SalesDataCreate(
tenant_id=tenant_id,
date=parsed_data["date"],
2025-08-14 13:26:59 +02:00
inventory_product_id=inventory_product_id,
2025-08-12 18:17:30 +02:00
quantity_sold=parsed_data["quantity_sold"],
unit_price=parsed_data.get("unit_price"),
revenue=parsed_data.get("revenue"),
location_id=parsed_data.get("location_id"),
source="csv"
)
created_record = await repository.create_sales_record(sales_data, tenant_id)
records_created += 1
# Enhanced progress logging
if records_created % 100 == 0:
logger.info(f"Enhanced processing: {records_created}/{len(rows)} records completed...")
except Exception as e:
error_msg = f"Row {index + 1}: {str(e)}"
errors.append(error_msg)
logger.warning("Enhanced record processing failed", error=error_msg)
success_rate = (records_created / len(rows)) * 100 if rows else 0
return {
"success": records_created > 0,
"total_rows": len(rows),
"records_created": records_created,
"success_rate": success_rate,
"errors": errors,
"warnings": warnings
}
except Exception as e:
logger.error("Enhanced CSV processing failed", error=str(e))
raise
async def _process_json_data(
self,
tenant_id: str,
json_content: str,
repository: SalesRepository,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Enhanced JSON processing with pandas integration"""
try:
# Parse JSON with base64 support
if json_content.startswith('data:'):
json_content = base64.b64decode(json_content.split(',')[1]).decode('utf-8')
data = json.loads(json_content)
# Handle different JSON structures
if isinstance(data, dict):
if 'data' in data:
records = data['data']
elif 'records' in data:
records = data['records']
elif 'sales' in data:
records = data['sales']
else:
records = [data] # Single record
elif isinstance(data, list):
records = data
else:
raise ValueError("Invalid JSON format")
# Convert to DataFrame for enhanced processing
if records:
df = pd.DataFrame(records)
df.columns = df.columns.str.strip().str.lower()
return await self._process_dataframe(tenant_id, df, repository, "json", filename)
else:
return {
"success": False,
"total_rows": 0,
"records_created": 0,
"errors": ["No records found in JSON"],
"warnings": []
}
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON: {str(e)}")
except Exception as e:
logger.error("Enhanced JSON processing failed", error=str(e))
raise
async def _process_excel_data(
self,
tenant_id: str,
excel_content: str,
repository: SalesRepository,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Enhanced Excel processing with base64 support"""
try:
# Decode base64 content
if excel_content.startswith('data:'):
excel_bytes = base64.b64decode(excel_content.split(',')[1])
else:
excel_bytes = base64.b64decode(excel_content)
# Read Excel file with pandas
df = pd.read_excel(io.BytesIO(excel_bytes), sheet_name=0)
# Enhanced column cleaning
df.columns = df.columns.str.strip().str.lower()
# Remove empty rows
df = df.dropna(how='all')
return await self._process_dataframe(tenant_id, df, repository, "excel", filename)
except Exception as e:
logger.error("Enhanced Excel processing failed", error=str(e))
raise
async def _process_dataframe(
self,
tenant_id: str,
df: pd.DataFrame,
repository: SalesRepository,
source: str,
filename: Optional[str] = None
) -> Dict[str, Any]:
"""Enhanced DataFrame processing with better error handling"""
try:
# Enhanced column mapping
column_mapping = self._detect_columns(df.columns.tolist())
if not column_mapping.get('date') or not column_mapping.get('product'):
required_missing = []
if not column_mapping.get('date'):
required_missing.append("date")
if not column_mapping.get('product'):
required_missing.append("product")
raise ValueError(f"Required columns missing: {', '.join(required_missing)}")
records_created = 0
errors = []
warnings = []
logger.info(f"Enhanced processing of {len(df)} records from {source}")
for index, row in df.iterrows():
try:
# Convert pandas row to dict
row_dict = {}
for col in df.columns:
val = row[col]
# Handle pandas NaN values
if pd.isna(val):
row_dict[col] = None
else:
row_dict[col] = val
# Enhanced data parsing
parsed_data = await self._parse_row_data(row_dict, column_mapping, index + 1)
if parsed_data.get("skip"):
errors.extend(parsed_data.get("errors", []))
warnings.extend(parsed_data.get("warnings", []))
continue
# Resolve product name to inventory_product_id
inventory_product_id = await self._resolve_product_to_inventory_id(
parsed_data["product_name"],
parsed_data.get("product_category"),
tenant_id
)
if not inventory_product_id:
error_msg = f"Row {index + 1}: Could not resolve product '{parsed_data['product_name']}' to inventory ID"
errors.append(error_msg)
logger.warning("Product resolution failed", error=error_msg)
continue
# Create enhanced sales record
sales_data = SalesDataCreate(
tenant_id=tenant_id,
date=parsed_data["date"],
inventory_product_id=inventory_product_id,
quantity_sold=parsed_data["quantity_sold"],
unit_price=parsed_data.get("unit_price"),
revenue=parsed_data.get("revenue"),
location_id=parsed_data.get("location_id"),
source=source
)
created_record = await repository.create_sales_record(sales_data, tenant_id)
records_created += 1
# Progress logging for large datasets
if records_created % 100 == 0:
logger.info(f"Enhanced DataFrame processing: {records_created}/{len(df)} records completed...")
except Exception as e:
error_msg = f"Row {index + 1}: {str(e)}"
errors.append(error_msg)
logger.warning("Enhanced record processing failed", error=error_msg)
success_rate = (records_created / len(df)) * 100 if len(df) > 0 else 0
return {
"success": records_created > 0,
"total_rows": len(df),
"records_created": records_created,
"success_rate": success_rate,
"errors": errors[:10], # Limit errors for performance
"warnings": warnings[:10] # Limit warnings
}
except ValueError:
raise
except Exception as e:
logger.error("Enhanced DataFrame processing failed", error=str(e))
raise
async def _parse_row_data(
self,
row: Dict[str, Any],
column_mapping: Dict[str, str],
row_number: int
) -> Dict[str, Any]:
"""Enhanced row data parsing with better validation"""
errors = []
warnings = []
try:
# Enhanced date extraction and validation
date_str = str(row.get(column_mapping.get('date', ''), '')).strip()
if not date_str or date_str.lower() in ['nan', 'null', 'none', '']:
errors.append(f"Row {row_number}: Missing date")
return {"skip": True, "errors": errors, "warnings": warnings}
parsed_date = self._parse_date(date_str)
if not parsed_date:
errors.append(f"Row {row_number}: Invalid date format: {date_str}")
return {"skip": True, "errors": errors, "warnings": warnings}
# Enhanced product name extraction and cleaning
product_name = str(row.get(column_mapping.get('product', ''), '')).strip()
if not product_name or product_name.lower() in ['nan', 'null', 'none', '']:
errors.append(f"Row {row_number}: Missing product name")
return {"skip": True, "errors": errors, "warnings": warnings}
product_name = self._clean_product_name(product_name)
# STRICT quantity validation for production data quality
quantity_raw = row.get(column_mapping.get('quantity', 'quantity'), 1)
try:
if pd.isna(quantity_raw):
# Allow default quantity of 1 for missing values
quantity = 1
else:
quantity = int(float(str(quantity_raw).replace(',', '.')))
if quantity <= 0:
# STRICT: Treat invalid quantities as ERRORS, not warnings
errors.append(f"Row {row_number}: Invalid quantity ({quantity}) - quantities must be positive")
return {"skip": True, "errors": errors, "warnings": warnings}
elif self.STRICT_VALIDATION and quantity > self.MAX_QUANTITY_PER_DAY:
# STRICT: Check for unrealistic quantities
errors.append(f"Row {row_number}: Unrealistic quantity ({quantity}) - exceeds maximum expected daily sales ({self.MAX_QUANTITY_PER_DAY})")
return {"skip": True, "errors": errors, "warnings": warnings}
except (ValueError, TypeError):
# STRICT: Treat non-numeric quantities as ERRORS
errors.append(f"Row {row_number}: Invalid quantity format ({quantity_raw}) - must be a positive number")
return {"skip": True, "errors": errors, "warnings": warnings}
# Enhanced revenue extraction
revenue = None
unit_price = None
if 'revenue' in column_mapping and column_mapping['revenue'] in row:
revenue_raw = row.get(column_mapping['revenue'])
if revenue_raw and not pd.isna(revenue_raw) and str(revenue_raw).lower() not in ['nan', 'null', 'none', '']:
try:
                        revenue = float(str(revenue_raw).replace(',', '.').replace('€', '').replace('$', '').strip())
if revenue < 0:
# STRICT: Treat negative revenue as ERROR, not warning
errors.append(f"Row {row_number}: Negative revenue ({revenue}) - revenue must be positive or zero")
return {"skip": True, "errors": errors, "warnings": warnings}
else:
# STRICT: Check for unrealistic revenue values
if self.STRICT_VALIDATION and revenue > self.MAX_REVENUE_PER_ITEM:
errors.append(f"Row {row_number}: Unrealistic revenue ({revenue}) - exceeds maximum expected value ({self.MAX_REVENUE_PER_ITEM})")
return {"skip": True, "errors": errors, "warnings": warnings}
# Calculate unit price if we have both revenue and quantity
unit_price = revenue / quantity if quantity > 0 else None
# STRICT: Validate unit price reasonableness
                            if unit_price and unit_price > self.MAX_UNIT_PRICE:  # above the configured per-unit ceiling, unrealistic for bakery items
errors.append(f"Row {row_number}: Unrealistic unit price ({unit_price:.2f}) - check quantity and revenue values")
return {"skip": True, "errors": errors, "warnings": warnings}
except (ValueError, TypeError):
# STRICT: Treat invalid revenue format as ERROR
errors.append(f"Row {row_number}: Invalid revenue format ({revenue_raw}) - must be a valid number")
return {"skip": True, "errors": errors, "warnings": warnings}
# Enhanced location extraction
location_id = None
if 'location' in column_mapping and column_mapping['location'] in row:
location_raw = row.get(column_mapping['location'])
if location_raw and not pd.isna(location_raw) and str(location_raw).lower() not in ['nan', 'null', 'none', '']:
location_id = str(location_raw).strip()
# Enhanced product category extraction
product_category = None
if 'category' in column_mapping and column_mapping['category'] in row:
category_raw = row.get(column_mapping['category'])
if category_raw and not pd.isna(category_raw) and str(category_raw).lower() not in ['nan', 'null', 'none', '']:
product_category = str(category_raw).strip()
return {
"skip": False,
"date": parsed_date,
"product_name": product_name,
"product_category": product_category,
"quantity_sold": quantity,
"unit_price": unit_price,
"revenue": revenue,
"location_id": location_id,
"errors": errors,
"warnings": warnings
}
except Exception as e:
errors.append(f"Row {row_number}: Enhanced parsing error: {str(e)}")
return {"skip": True, "errors": errors, "warnings": warnings}
def _detect_columns(self, columns: List[str]) -> Dict[str, str]:
"""Enhanced column detection with fuzzy matching"""
mapping = {}
columns_lower = [col.lower().strip() for col in columns]
for standard_name, possible_names in self.COLUMN_MAPPINGS.items():
best_match = None
best_score = 0
for col_idx, col in enumerate(columns_lower):
for possible in possible_names:
# Exact match (highest priority)
if possible == col:
best_match = columns[col_idx]
best_score = 100
break
# Contains match
elif possible in col or col in possible:
score = len(possible) / len(col) * 90
if score > best_score:
best_match = columns[col_idx]
best_score = score
if best_score == 100: # Found exact match
break
if best_match and best_score > 70: # Threshold for matches
mapping[standard_name] = best_match
# Enhanced alias mapping
if 'product' not in mapping and 'product_name' in mapping:
mapping['product'] = mapping['product_name']
if 'quantity' not in mapping and 'quantity_sold' in mapping:
mapping['quantity'] = mapping['quantity_sold']
if 'location' not in mapping and 'location_id' in mapping:
mapping['location'] = mapping['location_id']
return mapping
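
    # Illustrative example of the matching above (assumed headers, not a test fixture):
    # _detect_columns(["Fecha", "Producto", "Cantidad", "Importe"]) yields, among others,
    # {"date": "Fecha", "product": "Producto", "quantity": "Cantidad", "revenue": "Importe"},
    # since comparison is done on lowercased headers, exact matches score 100, and
    # substring matches must clear the 70-point threshold.
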
def _parse_date(self, date_str: str) -> Optional[datetime]:
"""Enhanced date parsing with pandas and multiple format support"""
if not date_str or str(date_str).lower() in ['nan', 'null', 'none']:
return None
date_str = str(date_str).strip()
# Try pandas first (most robust)
try:
parsed_dt = pd.to_datetime(date_str, dayfirst=True)
if hasattr(parsed_dt, 'to_pydatetime'):
parsed_dt = parsed_dt.to_pydatetime()
if parsed_dt.tzinfo is None:
parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
return parsed_dt
except Exception:
pass
# Try specific formats as fallback
for fmt in self.DATE_FORMATS:
try:
parsed_dt = datetime.strptime(date_str, fmt)
if parsed_dt.tzinfo is None:
parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
return parsed_dt
except ValueError:
continue
logger.warning(f"Could not parse date: {date_str}")
return None
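
    # Illustrative behaviour (assumed inputs): "03/02/2025" is parsed day-first by pandas
    # to 2025-02-03 00:00:00+00:00; strings pandas rejects are retried against the
    # explicit DATE_FORMATS list; anything still unparseable returns None and is logged.
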
def _clean_product_name(self, product_name: str) -> str:
"""Enhanced product name cleaning and standardization"""
if not product_name:
return "Producto sin nombre"
# Remove extra whitespace
cleaned = re.sub(r'\s+', ' ', str(product_name).strip())
# Remove special characters but keep Spanish characters
cleaned = re.sub(r'[^\w\s\-áéíóúñçüÁÉÍÓÚÑÇÜ]', '', cleaned)
# Capitalize first letter of each word
cleaned = cleaned.title()
# Enhanced corrections for Spanish bakeries
replacements = {
'Pan De': 'Pan de',
'Café Con': 'Café con',
            'Te ': 'Té ',
'Bocadillo De': 'Bocadillo de',
'Dulce De': 'Dulce de',
'Tarta De': 'Tarta de',
}
for old, new in replacements.items():
cleaned = cleaned.replace(old, new)
return cleaned if cleaned else "Producto sin nombre"
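
    # Illustrative before/after (assumed input): "  pan   de centeno!! " becomes
    # "Pan de Centeno", while an empty or symbols-only name falls back to
    # "Producto sin nombre".
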
def _clear_import_cache(self):
"""Clear the product resolution cache for a new import session"""
self.product_cache.clear()
self.failed_products.clear()
logger.info("Import cache cleared for new session")
async def _resolve_product_to_inventory_id(self, product_name: str, product_category: Optional[str], tenant_id: UUID) -> Optional[UUID]:
"""Resolve a product name to an inventory_product_id via the inventory service with caching and rate limiting"""
# Check cache first
if product_name in self.product_cache:
logger.debug("Product resolved from cache", product_name=product_name, tenant_id=tenant_id)
return self.product_cache[product_name]
# Skip if this product already failed to resolve
if product_name in self.failed_products:
logger.debug("Skipping previously failed product", product_name=product_name, tenant_id=tenant_id)
return None
max_retries = 3
base_delay = 1.0 # Start with 1 second delay
for attempt in range(max_retries):
try:
# Add delay before API calls to avoid rate limiting
if attempt > 0:
delay = base_delay * (2 ** (attempt - 1)) # Exponential backoff
logger.info(f"Retrying product resolution after {delay}s delay",
product_name=product_name, attempt=attempt, tenant_id=tenant_id)
await asyncio.sleep(delay)
# First try to search for existing product by name
products = await self.inventory_client.search_products(product_name, tenant_id)
if products:
# Return the first matching product's ID
product_id = products[0].get('id')
if product_id:
uuid_id = UUID(str(product_id))
self.product_cache[product_name] = uuid_id # Cache for future use
logger.info("Resolved product to existing inventory ID",
product_name=product_name, product_id=product_id, tenant_id=tenant_id)
return uuid_id
# Add small delay before creation attempt to avoid hitting rate limits
await asyncio.sleep(0.5)
# If not found, create a new ingredient/product in inventory
ingredient_data = {
'name': product_name,
'type': 'finished_product', # Assuming sales are of finished products
'unit': 'unit', # Default unit
'current_stock': 0, # No stock initially
'reorder_point': 0,
'cost_per_unit': 0,
'category': product_category or 'general'
}
created_product = await self.inventory_client.create_ingredient(ingredient_data, str(tenant_id))
if created_product and created_product.get('id'):
product_id = created_product['id']
uuid_id = UUID(str(product_id))
self.product_cache[product_name] = uuid_id # Cache for future use
logger.info("Created new inventory product for sales data",
product_name=product_name, product_id=product_id, tenant_id=tenant_id)
return uuid_id
logger.warning("Failed to resolve or create product in inventory",
product_name=product_name, tenant_id=tenant_id, attempt=attempt)
except Exception as e:
error_str = str(e)
if "429" in error_str or "rate limit" in error_str.lower():
logger.warning("Rate limit hit, retrying",
product_name=product_name, attempt=attempt, error=error_str, tenant_id=tenant_id)
if attempt < max_retries - 1:
continue # Retry with exponential backoff
else:
logger.error("Error resolving product to inventory ID",
error=error_str, product_name=product_name, tenant_id=tenant_id)
break # Don't retry for non-rate-limit errors
# If all retries failed, mark as failed and return None
self.failed_products.add(product_name)
logger.error("Failed to resolve product after all retries",
product_name=product_name, tenant_id=tenant_id)
return None
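
    # Resolution/retry sketch: with max_retries=3 and base_delay=1.0 the attempts run at
    # roughly t=0s, t=1s and t=3s (1s, then 2s of exponential backoff), plus a fixed 0.5s
    # pause before each creation call to stay under the inventory service rate limit.
    # Resolved IDs are cached per import session, and names that never resolve are added
    # to failed_products so later rows skip the lookup entirely.
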
def _structure_messages(self, messages: List[Union[str, Dict]]) -> List[Dict[str, Any]]:
"""Convert string messages to structured format"""
structured = []
for msg in messages:
if isinstance(msg, str):
structured.append({
"type": "general_message",
"message": msg,
"field": None,
"row": None,
"code": "GENERAL_MESSAGE"
})
else:
structured.append(msg)
return structured
def _generate_suggestions(
self,
validation_result: SalesValidationResult,
format_type: str,
warning_count: int
) -> List[str]:
"""Generate enhanced contextual suggestions"""
suggestions = []
if validation_result.is_valid:
suggestions.append("El archivo está listo para procesamiento")
suggestions.append(f"Se procesarán aproximadamente {validation_result.total_records} registros")
if validation_result.total_records > 1000:
suggestions.append("Archivo grande: el procesamiento puede tomar varios minutos")
suggestions.append("Considera dividir archivos muy grandes para mejor rendimiento")
if warning_count > 0:
suggestions.append("Revisa las advertencias antes de continuar")
suggestions.append("Los datos con advertencias se procesarán con valores por defecto")
# Format-specific suggestions
if format_type == "csv":
suggestions.append("Asegúrate de que las fechas estén en formato DD/MM/YYYY")
suggestions.append("Verifica que los números usen punto decimal (no coma)")
elif format_type in ["excel", "xlsx"]:
suggestions.append("Solo se procesará la primera hoja del archivo Excel")
suggestions.append("Evita celdas combinadas y fórmulas complejas")
else:
suggestions.append("Corrige los errores antes de continuar")
suggestions.append("Verifica que el archivo tenga el formato correcto")
if format_type not in ["csv", "excel", "xlsx", "json"]:
suggestions.append("Usa formato CSV o Excel para mejores resultados")
suggestions.append("El formato JSON es para usuarios avanzados")
if validation_result.total_records == 0:
suggestions.append("Asegúrate de que el archivo contenga datos")
suggestions.append("Verifica que el archivo no esté corrupto")
# Missing column suggestions
error_codes = [error.get("code", "") for error in validation_result.errors if isinstance(error, dict)]
if "MISSING_DATE_COLUMN" in error_codes:
suggestions.append("Incluye una columna de fecha (fecha, date, dia)")
if "MISSING_PRODUCT_COLUMN" in error_codes:
suggestions.append("Incluye una columna de producto (producto, product, item)")
return suggestions