REFACTOR data service

2025-08-12 18:17:30 +02:00
parent 7c237c0acc
commit fbe7470ad9
149 changed files with 8528 additions and 7393 deletions
--- a/services/sales/app/services/data_import_service.py
+++ b/services/sales/app/services/data_import_service.py
@@ -0,0 +1,943 @@
+# services/sales/app/services/data_import_service.py
+"""
+Data Import Service
+Service for importing sales data using repository pattern and enhanced error handling
+"""
+
+import csv
+import io
+import json
+import base64
+import pandas as pd
+from typing import Dict, Any, List, Optional, Union
+from datetime import datetime, timezone
+import structlog
+import re
+
+from app.repositories.sales_repository import SalesRepository
+from app.models.sales import SalesData
+from app.schemas.sales import SalesDataCreate
+from app.core.database import get_db_transaction
+
+logger = structlog.get_logger()
+
+
+# Import result schemas (dataclass definitions)
+from dataclasses import dataclass
+from typing import List, Dict, Any
+
+@dataclass
+class SalesValidationResult:
+    """Outcome of a pre-import validation pass (see DataImportService.validate_import_data)."""
+    # True only when the validation collected no errors.
+    is_valid: bool
+    # Number of data rows found; the visible validation code only fills this for CSV payloads.
+    total_records: int
+    # Estimated counts, extrapolated from a sample of the first rows rather than a full scan.
+    valid_records: int
+    invalid_records: int
+    # Structured messages; validation builds dicts with "type", "message", "field", "row", "code".
+    errors: List[Dict[str, Any]]
+    warnings: List[Dict[str, Any]]
+    # Free-form report: status, file format, size, detected columns, suggestions, ...
+    summary: Dict[str, Any]
+
+@dataclass
+class SalesImportResult:
+    """Outcome of an import run (see DataImportService.process_import)."""
+    # Mirrors the format handler's "success" flag; False when the import raised.
+    success: bool
+    # Total rows the handler reported seeing.
+    records_processed: int
+    # Rows that were persisted through the repository.
+    records_created: int
+    # process_import always reports 0 here: records are only created, never updated.
+    records_updated: int
+    # Computed by process_import as rows seen minus rows created.
+    records_failed: int
+    # Messages after being passed through DataImportService._structure_messages.
+    errors: List[Dict[str, Any]]
+    warnings: List[Dict[str, Any]]
+    # Elapsed time between the start and the end of process_import, in seconds.
+    processing_time_seconds: float
+
+
+class DataImportService:
+    """Enhanced data import service using repository pattern with STRICT validation for production"""
+    
+    # PRODUCTION VALIDATION CONFIGURATION
+    # STRICT_VALIDATION gates the upper-bound checks in _parse_row_data (quantity above
+    # MAX_QUANTITY_PER_DAY, revenue above MAX_REVENUE_PER_ITEM). The "must be positive"
+    # and "must not be negative" checks there run regardless of this flag.
+    STRICT_VALIDATION = True  # Set to False for lenient validation, True for production quality
+    MAX_QUANTITY_PER_DAY = 10000  # Maximum reasonable quantity per product per day
+    MAX_REVENUE_PER_ITEM = 100000  # Maximum reasonable revenue per line item  
+    # NOTE(review): the use of MAX_UNIT_PRICE is outside the visible part of the file.
+    MAX_UNIT_PRICE = 10000  # Maximum reasonable price per unit for bakery items
+    
+    # Common column mappings for different languages/formats
+    # Keys are canonical field names; values are header spellings accepted for that field.
+    # Presumably consumed by _detect_columns (not visible here) -- confirm how overlapping
+    # keys such as 'product' / 'product_name' and 'quantity' / 'quantity_sold' are resolved.
+    COLUMN_MAPPINGS = {
+        'date': ['date', 'fecha', 'datum', 'data', 'dia'],
+        'datetime': ['datetime', 'fecha_hora', 'timestamp'],
+        'product': ['product', 'producto', 'item', 'articulo', 'nombre', 'name'],
+        'product_name': ['product_name', 'nombre_producto', 'item_name'],
+        'quantity': ['quantity', 'cantidad', 'qty', 'units', 'unidades'],
+        'quantity_sold': ['quantity_sold', 'cantidad_vendida', 'sold'],
+        'revenue': ['revenue', 'ingresos', 'sales', 'ventas', 'total', 'importe'],
+        'price': ['price', 'precio', 'cost', 'coste'],
+        'location': ['location', 'ubicacion', 'tienda', 'store', 'punto_venta'],
+        'location_id': ['location_id', 'store_id', 'tienda_id'],
+    }
+    
+    # strptime-style patterns for date strings.
+    # NOTE(review): if _parse_date tries these in list order, an ambiguous value such as
+    # 03/04/2024 matches the day-first pattern before the month-first one -- confirm.
+    DATE_FORMATS = [
+        '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y', '%m-%d-%Y',
+        '%d.%m.%Y', '%Y/%m/%d', '%d/%m/%y', '%m/%d/%y',
+        '%Y-%m-%d %H:%M:%S', '%d/%m/%Y %H:%M',
+    ]
+    
+    def __init__(self):
+        """Create the service.
+
+        Nothing is set up here: the instance carries no state of its own.
+        """
+    
+    async def validate_import_data(self, data: Dict[str, Any]) -> SalesValidationResult:
+        """Validate an import payload without persisting anything.
+
+        Checks run in this order, accumulating errors and warnings:
+
+        1. ``tenant_id`` and ``data`` must be present; a missing ``data`` returns at once.
+        2. ``data_format`` must be one of the supported format names.
+        3. Payload size: error above 10 MB, warning above 1 MB.
+        4. CSV only, and only if nothing failed so far: required columns are looked up
+           and the first rows (at most 10) are parsed to estimate the error rate.
+
+        Args:
+            data: Request dict; the keys read here are ``tenant_id``, ``data`` and
+                ``data_format``.
+
+        Returns:
+            A SalesValidationResult whose ``is_valid`` is True only when no error was
+            collected. ``valid_records`` / ``invalid_records`` are estimates extrapolated
+            from the sample. Unexpected exceptions are not raised; they are reported as a
+            single ``SYSTEM_ERROR`` entry.
+        """
+        try:
+            logger.info("Starting enhanced import data validation", tenant_id=data.get("tenant_id"))
+            
+            # Start from an empty, optimistic result; its fields are overwritten below.
+            validation_result = SalesValidationResult(
+                is_valid=True,
+                total_records=0,
+                valid_records=0,
+                invalid_records=0,
+                errors=[],
+                warnings=[],
+                summary={}
+            )
+            
+            errors = []
+            warnings = []
+            
+            # Basic validation checks
+            if not data.get("tenant_id"):
+                errors.append({
+                    "type": "missing_field",
+                    "message": "tenant_id es requerido",
+                    "field": "tenant_id",
+                    "row": None,
+                    "code": "MISSING_TENANT_ID"
+                })
+            
+            if not data.get("data"):
+                errors.append({
+                    "type": "missing_data",
+                    "message": "Datos de archivo faltantes",
+                    "field": "data",
+                    "row": None,
+                    "code": "NO_DATA_PROVIDED"
+                })
+                
+                # Nothing else can be checked without a payload, so return early.
+                # A tenant_id error collected above is still part of `errors`.
+                validation_result.is_valid = False
+                validation_result.errors = errors
+                validation_result.summary = {
+                    "status": "failed",
+                    "reason": "no_data_provided",
+                    "file_format": data.get("data_format", "unknown"),
+                    "suggestions": ["Selecciona un archivo válido para importar"]
+                }
+                return validation_result
+            
+            # Validate file format
+            format_type = data.get("data_format", "").lower()
+            supported_formats = ["csv", "excel", "xlsx", "xls", "json", "pos"]
+            
+            if format_type not in supported_formats:
+                errors.append({
+                    "type": "unsupported_format",
+                    "message": f"Formato no soportado: {format_type}",
+                    "field": "data_format",
+                    "row": None,
+                    "code": "UNSUPPORTED_FORMAT"
+                })
+            
+            # Validate data size
+            # NOTE(review): this is len() of the payload as received; for a str that is a
+            # character count, although the summary below reports it as bytes.
+            data_content = data.get("data", "")
+            data_size = len(data_content)
+            
+            if data_size == 0:
+                errors.append({
+                    "type": "empty_file",
+                    "message": "El archivo está vacío",
+                    "field": "data",
+                    "row": None,
+                    "code": "EMPTY_FILE"
+                })
+            elif data_size > 10 * 1024 * 1024:  # 10MB limit
+                errors.append({
+                    "type": "file_too_large",
+                    "message": "Archivo demasiado grande (máximo 10MB)",
+                    "field": "data",
+                    "row": None,
+                    "code": "FILE_TOO_LARGE"
+                })
+            elif data_size > 1024 * 1024:  # 1MB warning
+                warnings.append({
+                    "type": "large_file",
+                    "message": "Archivo grande detectado. El procesamiento puede tomar más tiempo.",
+                    "field": "data",
+                    "row": None,
+                    "code": "LARGE_FILE_WARNING"
+                })
+            
+            # Analyze CSV content if format is CSV
+            # Skipped entirely when any earlier check has already produced an error.
+            if format_type == "csv" and data_content and not errors:
+                try:
+                    reader = csv.DictReader(io.StringIO(data_content))
+                    rows = list(reader)
+                    
+                    validation_result.total_records = len(rows)
+                    
+                    if not rows:
+                        errors.append({
+                            "type": "empty_content",
+                            "message": "El archivo CSV no contiene datos",
+                            "field": "data",
+                            "row": None,
+                            "code": "NO_CONTENT"
+                        })
+                    else:
+                        # Enhanced column analysis
+                        headers = list(rows[0].keys()) if rows else []
+                        column_mapping = self._detect_columns(headers)
+                        
+                        # Check for required columns
+                        if not column_mapping.get('date'):
+                            errors.append({
+                                "type": "missing_column",
+                                "message": "Columna de fecha no encontrada",
+                                "field": "date",
+                                "row": None,
+                                "code": "MISSING_DATE_COLUMN"
+                            })
+                        
+                        if not column_mapping.get('product'):
+                            errors.append({
+                                "type": "missing_column",
+                                "message": "Columna de producto no encontrada",
+                                "field": "product",
+                                "row": None,
+                                "code": "MISSING_PRODUCT_COLUMN"
+                            })
+                        
+                        # A missing quantity column is only a warning, not an error.
+                        if not column_mapping.get('quantity'):
+                            warnings.append({
+                                "type": "missing_column",
+                                "message": "Columna de cantidad no encontrada, se usará 1 por defecto",
+                                "field": "quantity",
+                                "row": None,
+                                "code": "MISSING_QUANTITY_COLUMN"
+                            })
+                        
+                        # Enhanced data quality estimation
+                        if not errors:
+                            # Parse only the first rows (at most 10) and extrapolate the
+                            # observed failure rate to the whole file.
+                            sample_size = min(10, len(rows))
+                            sample_rows = rows[:sample_size]
+                            quality_issues = 0
+                            
+                            for i, row in enumerate(sample_rows):
+                                # Row numbers passed to the parser are 1-based.
+                                parsed_data = await self._parse_row_data(row, column_mapping, i + 1)
+                                if parsed_data.get("skip") or parsed_data.get("errors"):
+                                    quality_issues += 1
+                            
+                            estimated_error_rate = (quality_issues / sample_size) * 100 if sample_size > 0 else 0
+                            estimated_invalid = int(validation_result.total_records * estimated_error_rate / 100)
+                            
+                            validation_result.valid_records = validation_result.total_records - estimated_invalid
+                            validation_result.invalid_records = estimated_invalid
+                            
+                            # STRICT: Any data quality issues should fail validation for production
+                            if estimated_error_rate > 0:
+                                errors.append({
+                                    "type": "data_quality_error",
+                                    "message": f"Falló la validación de calidad: {estimated_error_rate:.0f}% de los datos tienen errores críticos",
+                                    "field": "data", 
+                                    "row": None,
+                                    "code": "DATA_QUALITY_FAILED"
+                                })
+                                
+                                # Add specific error details
+                                # Exactly one severity-specific entry is appended in
+                                # addition to the generic DATA_QUALITY_FAILED entry above.
+                                if estimated_error_rate > 50:
+                                    errors.append({
+                                        "type": "data_quality_critical",
+                                        "message": f"Calidad de datos crítica: más del 50% de los registros tienen errores",
+                                        "field": "data",
+                                        "row": None, 
+                                        "code": "DATA_QUALITY_CRITICAL"
+                                    })
+                                elif estimated_error_rate > 20:
+                                    errors.append({
+                                        "type": "data_quality_high",
+                                        "message": f"Alta tasa de errores detectada: {estimated_error_rate:.0f}% de los datos requieren corrección",
+                                        "field": "data",
+                                        "row": None,
+                                        "code": "DATA_QUALITY_HIGH_ERROR_RATE" 
+                                    })
+                                else:
+                                    # Even small error rates are now treated as validation failures
+                                    errors.append({
+                                        "type": "data_quality_detected",
+                                        "message": f"Se detectaron errores de validación en {estimated_error_rate:.0f}% de los datos",
+                                        "field": "data",
+                                        "row": None,
+                                        "code": "DATA_QUALITY_ERRORS_FOUND"
+                                    })
+                        else:
+                            # A required column is missing: count every record as invalid.
+                            validation_result.valid_records = 0
+                            validation_result.invalid_records = validation_result.total_records
+                            
+                except Exception as csv_error:
+                    # A failure of the analysis itself is downgraded to a warning; it
+                    # does not by itself make the payload invalid.
+                    logger.warning("Enhanced CSV analysis failed", error=str(csv_error))
+                    warnings.append({
+                        "type": "analysis_warning",
+                        "message": f"No se pudo analizar completamente el CSV: {str(csv_error)}",
+                        "field": "data",
+                        "row": None,
+                        "code": "CSV_ANALYSIS_WARNING"
+                    })
+            
+            # Set validation result
+            validation_result.is_valid = len(errors) == 0
+            validation_result.errors = errors
+            validation_result.warnings = warnings
+            
+            # Enhanced summary generation
+            # - estimated_processing_time_seconds: one second per 100 records, minimum 1.
+            # - detected_columns: for CSV the header row is parsed a second time and the
+            #   keys returned by _detect_columns are listed; other formats get [].
+            # - validation_timestamp: naive UTC time in ISO format.
+            validation_result.summary = {
+                "status": "valid" if validation_result.is_valid else "invalid",
+                "file_format": format_type,
+                "file_size_bytes": data_size,
+                "file_size_mb": round(data_size / (1024 * 1024), 2),
+                "estimated_processing_time_seconds": max(1, validation_result.total_records // 100),
+                "validation_timestamp": datetime.utcnow().isoformat(),
+                "detected_columns": list(self._detect_columns(list(csv.DictReader(io.StringIO(data_content)).fieldnames or [])).keys()) if format_type == "csv" and data_content else [],
+                "suggestions": self._generate_suggestions(validation_result, format_type, len(warnings))
+            }
+            
+            logger.info("Enhanced import validation completed",
+                       is_valid=validation_result.is_valid,
+                       total_records=validation_result.total_records,
+                       error_count=len(errors),
+                       warning_count=len(warnings))
+            
+            return validation_result
+            
+        except Exception as e:
+            # Top-level boundary: any unexpected failure is logged and converted into
+            # an invalid result instead of propagating to the caller.
+            logger.error("Enhanced validation process failed", error=str(e))
+            
+            return SalesValidationResult(
+                is_valid=False,
+                total_records=0,
+                valid_records=0,
+                invalid_records=0,
+                errors=[{
+                    "type": "system_error",
+                    "message": f"Error en el proceso de validación: {str(e)}",
+                    "field": None,
+                    "row": None,
+                    "code": "SYSTEM_ERROR"
+                }],
+                warnings=[],
+                summary={
+                    "status": "error",
+                    "file_format": data.get("data_format", "unknown"),
+                    "error_type": "system_error",
+                    "suggestions": [
+                        "Intenta de nuevo con un archivo diferente",
+                        "Contacta soporte si el problema persiste"
+                    ]
+                }
+            )
+    
+    async def process_import(
+        self,
+        tenant_id: str,
+        content: str,
+        file_format: str,
+        filename: Optional[str] = None
+    ) -> SalesImportResult:
+        """Import sales records from ``content`` and persist them for ``tenant_id``.
+
+        The payload is handed to the handler for its format, which creates the
+        records through a SalesRepository bound to a single database transaction.
+
+        Args:
+            tenant_id: Tenant that will own the created records.
+            content: Raw payload; decoding is done by the format-specific handler.
+            file_format: ``csv``, ``json``, ``excel``, ``xlsx`` or ``xls``
+                (case-insensitive, surrounding whitespace ignored).
+            filename: Original file name; only used for logging.
+
+        Returns:
+            A SalesImportResult. This method does not raise: an unsupported format
+            or any other failure is reported as ``success=False`` with a single
+            ``IMPORT_FAILURE`` error entry.
+        """
+        # Timezone-aware clock: naive datetime.utcnow() is deprecated since Python 3.12.
+        start_time = datetime.now(timezone.utc)
+
+        try:
+            logger.info("Starting enhanced data import",
+                       filename=filename,
+                       format=file_format,
+                       tenant_id=tenant_id)
+
+            # Resolve the handler before opening a transaction, so an unsupported
+            # (or missing) format fails fast with a clear message.
+            # 'xls' is accepted because validate_import_data lists it as supported;
+            # reading it relies on pandas having a suitable engine installed.
+            handlers = {
+                'csv': self._process_csv_data,
+                'json': self._process_json_data,
+                'excel': self._process_excel_data,
+                'xlsx': self._process_excel_data,
+                'xls': self._process_excel_data,
+            }
+            normalized_format = (file_format or "").strip().lower()
+            handler = handlers.get(normalized_format)
+            if handler is None:
+                raise ValueError(f"Unsupported format: {file_format}")
+
+            async with get_db_transaction() as db:
+                repository = SalesRepository(db)
+                result = await handler(tenant_id, content, repository, filename)
+
+                processing_time = (datetime.now(timezone.utc) - start_time).total_seconds()
+
+                total_rows = result.get("total_rows", 0)
+                records_created = result.get("records_created", 0)
+
+                final_result = SalesImportResult(
+                    success=result.get("success", False),
+                    records_processed=total_rows,
+                    records_created=records_created,
+                    records_updated=0,  # We don't update, only create
+                    # Clamp so inconsistent handler counts never yield a negative value.
+                    records_failed=max(0, total_rows - records_created),
+                    errors=self._structure_messages(result.get("errors", [])),
+                    warnings=self._structure_messages(result.get("warnings", [])),
+                    processing_time_seconds=processing_time
+                )
+
+                # The handler may report success=False without raising, so the log
+                # line carries the flag instead of claiming success unconditionally.
+                logger.info("Enhanced data import completed",
+                           success=final_result.success,
+                           records_created=final_result.records_created,
+                           records_failed=final_result.records_failed,
+                           processing_time=processing_time)
+
+                return final_result
+
+        except Exception as e:
+            # Top-level boundary: convert any failure into a result object.
+            processing_time = (datetime.now(timezone.utc) - start_time).total_seconds()
+
+            logger.error("Enhanced data import failed", error=str(e), tenant_id=tenant_id)
+
+            return SalesImportResult(
+                success=False,
+                records_processed=0,
+                records_created=0,
+                records_updated=0,
+                records_failed=0,
+                errors=[{
+                    "type": "import_error",
+                    "message": f"Import failed: {str(e)}",
+                    "field": None,
+                    "row": None,
+                    "code": "IMPORT_FAILURE"
+                }],
+                warnings=[],
+                processing_time_seconds=processing_time
+            )
+    
+    async def _process_csv_data(
+        self,
+        tenant_id: str,
+        csv_content: str,
+        repository: SalesRepository,
+        filename: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Parse CSV text and create one sales record per valid row.
+
+        Args:
+            tenant_id: Tenant that will own the created records.
+            csv_content: CSV text whose first line is the header row.
+            repository: Repository used to persist each record.
+            filename: Accepted for signature parity with the other handlers; unused.
+
+        Returns:
+            Dict with ``success`` (at least one record created), ``total_rows``,
+            ``records_created``, ``success_rate`` (percent), ``errors`` and
+            ``warnings``. An empty file yields a failure dict without
+            ``success_rate``.
+
+        Raises:
+            ValueError: if no date or no product column is detected, matching the
+                behaviour of ``_process_dataframe``.
+        """
+        try:
+            reader = csv.DictReader(io.StringIO(csv_content))
+            rows = list(reader)
+
+            if not rows:
+                return {
+                    "success": False,
+                    "total_rows": 0,
+                    "records_created": 0,
+                    "errors": ["CSV file is empty"],
+                    "warnings": []
+                }
+
+            # Map columns from the declared header row. The keys of the first data
+            # row can additionally contain a None key (DictReader's overflow bucket
+            # when that row has more fields than the header), which is not a header.
+            column_mapping = self._detect_columns(list(reader.fieldnames or []))
+
+            # Fail once, up front, instead of emitting a "Missing date" /
+            # "Missing product name" error for every single row.
+            required_missing = [
+                field for field in ('date', 'product')
+                if not column_mapping.get(field)
+            ]
+            if required_missing:
+                raise ValueError(f"Required columns missing: {', '.join(required_missing)}")
+
+            records_created = 0
+            errors = []
+            warnings = []
+
+            logger.info(f"Processing {len(rows)} records from CSV with enhanced mapping")
+
+            for index, row in enumerate(rows):
+                try:
+                    # Row numbers reported to the user are 1-based.
+                    parsed_data = await self._parse_row_data(row, column_mapping, index + 1)
+                    if parsed_data.get("skip"):
+                        errors.extend(parsed_data.get("errors", []))
+                        warnings.extend(parsed_data.get("warnings", []))
+                        continue
+
+                    sales_data = SalesDataCreate(
+                        tenant_id=tenant_id,
+                        date=parsed_data["date"],
+                        product_name=parsed_data["product_name"],
+                        product_category=parsed_data.get("product_category"),
+                        quantity_sold=parsed_data["quantity_sold"],
+                        unit_price=parsed_data.get("unit_price"),
+                        revenue=parsed_data.get("revenue"),
+                        location_id=parsed_data.get("location_id"),
+                        source="csv"
+                    )
+
+                    await repository.create_sales_record(sales_data, tenant_id)
+                    records_created += 1
+
+                    if records_created % 100 == 0:
+                        logger.info(f"Enhanced processing: {records_created}/{len(rows)} records completed...")
+
+                except Exception as e:
+                    # Best effort: a failing row is recorded and the import continues.
+                    error_msg = f"Row {index + 1}: {str(e)}"
+                    errors.append(error_msg)
+                    logger.warning("Enhanced record processing failed", error=error_msg)
+
+            success_rate = (records_created / len(rows)) * 100 if rows else 0
+
+            return {
+                "success": records_created > 0,
+                "total_rows": len(rows),
+                "records_created": records_created,
+                "success_rate": success_rate,
+                "errors": errors,
+                "warnings": warnings
+            }
+
+        except Exception as e:
+            logger.error("Enhanced CSV processing failed", error=str(e))
+            raise
+    
+    async def _process_json_data(
+        self,
+        tenant_id: str,
+        json_content: str,
+        repository: SalesRepository,
+        filename: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Parse a JSON payload and import its records via ``_process_dataframe``.
+
+        Accepted shapes: a list of record objects, a single record object, or an
+        object wrapping the records under ``data``, ``records`` or ``sales``
+        (checked in that order). The wrapped value may itself be a single record.
+
+        Args:
+            tenant_id: Tenant that will own the created records.
+            json_content: JSON text, or a ``data:`` URI whose part after the first
+                comma is base64-encoded JSON.
+            repository: Repository used to persist each record.
+            filename: Passed through to ``_process_dataframe``.
+
+        Returns:
+            The dict produced by ``_process_dataframe``, or a failure dict when the
+            payload holds no records.
+
+        Raises:
+            ValueError: on malformed JSON or a payload that is not record-shaped.
+        """
+        try:
+            # Parse JSON with base64 support
+            if json_content.startswith('data:'):
+                # partition() tolerates a URI without a comma (empty payload, which
+                # then fails as invalid JSON) instead of raising IndexError.
+                encoded = json_content.partition(',')[2]
+                json_content = base64.b64decode(encoded).decode('utf-8')
+
+            data = json.loads(json_content)
+
+            # Handle different JSON structures
+            if isinstance(data, dict):
+                for wrapper_key in ('data', 'records', 'sales'):
+                    if wrapper_key in data:
+                        records = data[wrapper_key]
+                        break
+                else:
+                    records = [data]  # Single record
+            elif isinstance(data, list):
+                records = data
+            else:
+                raise ValueError("Invalid JSON format")
+
+            if not records:
+                return {
+                    "success": False,
+                    "total_rows": 0,
+                    "records_created": 0,
+                    "errors": ["No records found in JSON"],
+                    "warnings": []
+                }
+
+            # A wrapper key may hold one record object rather than a list; anything
+            # else (string, number, ...) cannot be turned into rows.
+            if isinstance(records, dict):
+                records = [records]
+            elif not isinstance(records, list):
+                raise ValueError("Invalid JSON format")
+
+            # Convert to DataFrame for enhanced processing
+            df = pd.DataFrame(records)
+            # Stringify labels first: a list of non-object items produces integer
+            # column labels, on which the .str accessor is not available.
+            df.columns = [str(col).strip().lower() for col in df.columns]
+
+            return await self._process_dataframe(tenant_id, df, repository, "json", filename)
+
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON: {str(e)}") from e
+        except Exception as e:
+            logger.error("Enhanced JSON processing failed", error=str(e))
+            raise
+    
+    async def _process_excel_data(
+        self,
+        tenant_id: str,
+        excel_content: str,
+        repository: SalesRepository,
+        filename: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Decode a base64 Excel workbook and import its first sheet.
+
+        Args:
+            tenant_id: Tenant that will own the created records.
+            excel_content: Base64 text of the workbook, either bare or as a
+                ``data:`` URI (everything after the first comma is decoded).
+            repository: Repository used to persist each record.
+            filename: Passed through to ``_process_dataframe``.
+
+        Returns:
+            The dict produced by ``_process_dataframe``.
+
+        Raises:
+            Exception: any decoding, parsing or processing error is logged and
+                re-raised unchanged.
+        """
+        try:
+            # Decode base64 content
+            if excel_content.startswith('data:'):
+                # partition() avoids an IndexError on a URI that has no comma.
+                excel_bytes = base64.b64decode(excel_content.partition(',')[2])
+            else:
+                excel_bytes = base64.b64decode(excel_content)
+
+            # Read Excel file with pandas
+            df = pd.read_excel(io.BytesIO(excel_bytes), sheet_name=0)
+
+            # Header cells can be numbers or dates; the .str accessor either fails
+            # on those or turns them into NaN, so stringify each label explicitly.
+            df.columns = [str(col).strip().lower() for col in df.columns]
+
+            # Remove empty rows
+            df = df.dropna(how='all')
+
+            return await self._process_dataframe(tenant_id, df, repository, "excel", filename)
+
+        except Exception as e:
+            logger.error("Enhanced Excel processing failed", error=str(e))
+            raise
+    
+    async def _process_dataframe(
+        self,
+        tenant_id: str,
+        df: pd.DataFrame,
+        repository: SalesRepository,
+        source: str,
+        filename: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Create one sales record per valid DataFrame row.
+
+        Args:
+            tenant_id: Tenant that will own the created records.
+            df: Rows to import; column labels are matched via ``_detect_columns``.
+            repository: Repository used to persist each record.
+            source: Label stored on every record (the visible callers pass
+                ``"json"`` or ``"excel"``).
+            filename: Accepted for signature parity; unused here.
+
+        Returns:
+            Dict with ``success`` (at least one record created), ``total_rows``,
+            ``records_created``, ``success_rate`` (percent) and the first ten
+            ``errors`` / ``warnings``.
+
+        Raises:
+            ValueError: if no date or no product column is detected.
+        """
+        try:
+            column_mapping = self._detect_columns(df.columns.tolist())
+
+            # Abort before touching any row when a mandatory column is absent.
+            required_missing = [
+                field for field in ("date", "product")
+                if not column_mapping.get(field)
+            ]
+            if required_missing:
+                raise ValueError(f"Required columns missing: {', '.join(required_missing)}")
+
+            total_rows = len(df)
+            created_count = 0
+            error_messages = []
+            warning_messages = []
+
+            logger.info(f"Enhanced processing of {total_rows} records from {source}")
+
+            for index, row in df.iterrows():
+                try:
+                    # Plain dict of the row with pandas missing values mapped to None.
+                    row_dict = {
+                        col: (None if pd.isna(row[col]) else row[col])
+                        for col in df.columns
+                    }
+
+                    parsed = await self._parse_row_data(row_dict, column_mapping, index + 1)
+                    if parsed.get("skip"):
+                        error_messages.extend(parsed.get("errors", []))
+                        warning_messages.extend(parsed.get("warnings", []))
+                        continue
+
+                    record = SalesDataCreate(
+                        tenant_id=tenant_id,
+                        date=parsed["date"],
+                        product_name=parsed["product_name"],
+                        product_category=parsed.get("product_category"),
+                        quantity_sold=parsed["quantity_sold"],
+                        unit_price=parsed.get("unit_price"),
+                        revenue=parsed.get("revenue"),
+                        location_id=parsed.get("location_id"),
+                        source=source
+                    )
+                    await repository.create_sales_record(record, tenant_id)
+                    created_count += 1
+
+                    # Emit a progress line every 100 created records.
+                    if created_count % 100 == 0:
+                        logger.info(f"Enhanced DataFrame processing: {created_count}/{total_rows} records completed...")
+
+                except Exception as e:
+                    # Best effort: note the failing row and keep going.
+                    error_msg = f"Row {index + 1}: {str(e)}"
+                    error_messages.append(error_msg)
+                    logger.warning("Enhanced record processing failed", error=error_msg)
+
+            if total_rows > 0:
+                success_rate = (created_count / total_rows) * 100
+            else:
+                success_rate = 0
+
+            return {
+                "success": created_count > 0,
+                "total_rows": total_rows,
+                "records_created": created_count,
+                "success_rate": success_rate,
+                "errors": error_messages[:10],  # Limit errors for performance
+                "warnings": warning_messages[:10]  # Limit warnings
+            }
+
+        except ValueError:
+            # Propagated as-is, without the extra error log below.
+            raise
+        except Exception as e:
+            logger.error("Enhanced DataFrame processing failed", error=str(e))
+            raise
+    
+    async def _parse_row_data(
+        self, 
+        row: Dict[str, Any], 
+        column_mapping: Dict[str, str], 
+        row_number: int
+    ) -> Dict[str, Any]:
+        """Enhanced row data parsing with better validation"""
+        errors = []
+        warnings = []
+        
+        try:
+            # Enhanced date extraction and validation
+            date_str = str(row.get(column_mapping.get('date', ''), '')).strip()
+            if not date_str or date_str.lower() in ['nan', 'null', 'none', '']:
+                errors.append(f"Row {row_number}: Missing date")
+                return {"skip": True, "errors": errors, "warnings": warnings}
+            
+            parsed_date = self._parse_date(date_str)
+            if not parsed_date:
+                errors.append(f"Row {row_number}: Invalid date format: {date_str}")
+                return {"skip": True, "errors": errors, "warnings": warnings}
+            
+            # Enhanced product name extraction and cleaning
+            product_name = str(row.get(column_mapping.get('product', ''), '')).strip()
+            if not product_name or product_name.lower() in ['nan', 'null', 'none', '']:
+                errors.append(f"Row {row_number}: Missing product name")
+                return {"skip": True, "errors": errors, "warnings": warnings}
+            
+            product_name = self._clean_product_name(product_name)
+            
+            # STRICT quantity validation for production data quality
+            quantity_raw = row.get(column_mapping.get('quantity', 'quantity'), 1)
+            try:
+                if pd.isna(quantity_raw):
+                    # Allow default quantity of 1 for missing values
+                    quantity = 1
+                else:
+                    quantity = int(float(str(quantity_raw).replace(',', '.')))
+                    if quantity <= 0:
+                        # STRICT: Treat invalid quantities as ERRORS, not warnings
+                        errors.append(f"Row {row_number}: Invalid quantity ({quantity}) - quantities must be positive")
+                        return {"skip": True, "errors": errors, "warnings": warnings}
+                    elif self.STRICT_VALIDATION and quantity > self.MAX_QUANTITY_PER_DAY:
+                        # STRICT: Check for unrealistic quantities 
+                        errors.append(f"Row {row_number}: Unrealistic quantity ({quantity}) - exceeds maximum expected daily sales ({self.MAX_QUANTITY_PER_DAY})")
+                        return {"skip": True, "errors": errors, "warnings": warnings}
+            except (ValueError, TypeError):
+                # STRICT: Treat non-numeric quantities as ERRORS
+                errors.append(f"Row {row_number}: Invalid quantity format ({quantity_raw}) - must be a positive number")
+                return {"skip": True, "errors": errors, "warnings": warnings}
+            
+            # Enhanced revenue extraction
+            revenue = None
+            unit_price = None
+            if 'revenue' in column_mapping and column_mapping['revenue'] in row:
+                revenue_raw = row.get(column_mapping['revenue'])
+                if revenue_raw and not pd.isna(revenue_raw) and str(revenue_raw).lower() not in ['nan', 'null', 'none', '']:
+                    try:
+                        revenue = float(str(revenue_raw).replace(',', '.').replace('€', '').replace('$', '').strip())
+                        if revenue < 0:
+                            # STRICT: Treat negative revenue as ERROR, not warning
+                            errors.append(f"Row {row_number}: Negative revenue ({revenue}) - revenue must be positive or zero")
+                            return {"skip": True, "errors": errors, "warnings": warnings}
+                        else:
+                            # STRICT: Check for unrealistic revenue values
+                            if self.STRICT_VALIDATION and revenue > self.MAX_REVENUE_PER_ITEM:
+                                errors.append(f"Row {row_number}: Unrealistic revenue ({revenue}) - exceeds maximum expected value ({self.MAX_REVENUE_PER_ITEM})")
+                                return {"skip": True, "errors": errors, "warnings": warnings}
+                            
+                            # Calculate unit price if we have both revenue and quantity
+                            unit_price = revenue / quantity if quantity > 0 else None
+                            
+                            # STRICT: Validate unit price reasonableness
+                            if unit_price and unit_price > 10000:  # More than €10,000 per unit seems unrealistic for bakery
+                                errors.append(f"Row {row_number}: Unrealistic unit price ({unit_price:.2f}) - check quantity and revenue values")
+                                return {"skip": True, "errors": errors, "warnings": warnings}
+                    except (ValueError, TypeError):
+                        # STRICT: Treat invalid revenue format as ERROR
+                        errors.append(f"Row {row_number}: Invalid revenue format ({revenue_raw}) - must be a valid number")
+                        return {"skip": True, "errors": errors, "warnings": warnings}
+            
+            # Enhanced location extraction
+            location_id = None
+            if 'location' in column_mapping and column_mapping['location'] in row:
+                location_raw = row.get(column_mapping['location'])
+                if location_raw and not pd.isna(location_raw) and str(location_raw).lower() not in ['nan', 'null', 'none', '']:
+                    location_id = str(location_raw).strip()
+            
+            # Enhanced product category extraction
+            product_category = None
+            if 'category' in column_mapping and column_mapping['category'] in row:
+                category_raw = row.get(column_mapping['category'])
+                if category_raw and not pd.isna(category_raw) and str(category_raw).lower() not in ['nan', 'null', 'none', '']:
+                    product_category = str(category_raw).strip()
+            
+            return {
+                "skip": False,
+                "date": parsed_date,
+                "product_name": product_name,
+                "product_category": product_category,
+                "quantity_sold": quantity,
+                "unit_price": unit_price,
+                "revenue": revenue,
+                "location_id": location_id,
+                "errors": errors,
+                "warnings": warnings
+            }
+            
+        except Exception as e:
+            errors.append(f"Row {row_number}: Enhanced parsing error: {str(e)}")
+            return {"skip": True, "errors": errors, "warnings": warnings}
+    
+    def _detect_columns(self, columns: List[str]) -> Dict[str, str]:
+        """Enhanced column detection with fuzzy matching"""
+        mapping = {}
+        columns_lower = [col.lower().strip() for col in columns]
+        
+        for standard_name, possible_names in self.COLUMN_MAPPINGS.items():
+            best_match = None
+            best_score = 0
+            
+            for col_idx, col in enumerate(columns_lower):
+                for possible in possible_names:
+                    # Exact match (highest priority)
+                    if possible == col:
+                        best_match = columns[col_idx]
+                        best_score = 100
+                        break
+                    # Contains match
+                    elif possible in col or col in possible:
+                        score = len(possible) / len(col) * 90
+                        if score > best_score:
+                            best_match = columns[col_idx]
+                            best_score = score
+                
+                if best_score == 100:  # Found exact match
+                    break
+            
+            if best_match and best_score > 70:  # Threshold for matches
+                mapping[standard_name] = best_match
+        
+        # Enhanced alias mapping
+        if 'product' not in mapping and 'product_name' in mapping:
+            mapping['product'] = mapping['product_name']
+        if 'quantity' not in mapping and 'quantity_sold' in mapping:
+            mapping['quantity'] = mapping['quantity_sold']
+        if 'location' not in mapping and 'location_id' in mapping:
+            mapping['location'] = mapping['location_id']
+        
+        return mapping
+    
+    def _parse_date(self, date_str: str) -> Optional[datetime]:
+        """Enhanced date parsing with pandas and multiple format support"""
+        if not date_str or str(date_str).lower() in ['nan', 'null', 'none']:
+            return None
+        
+        date_str = str(date_str).strip()
+        
+        # Try pandas first (most robust)
+        try:
+            parsed_dt = pd.to_datetime(date_str, dayfirst=True)
+            if hasattr(parsed_dt, 'to_pydatetime'):
+                parsed_dt = parsed_dt.to_pydatetime()
+            
+            if parsed_dt.tzinfo is None:
+                parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
+            
+            return parsed_dt
+        except Exception:
+            pass
+        
+        # Try specific formats as fallback
+        for fmt in self.DATE_FORMATS:
+            try:
+                parsed_dt = datetime.strptime(date_str, fmt)
+                if parsed_dt.tzinfo is None:
+                    parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
+                return parsed_dt
+            except ValueError:
+                continue
+        
+        logger.warning(f"Could not parse date: {date_str}")
+        return None
+    
+    def _clean_product_name(self, product_name: str) -> str:
+        """Enhanced product name cleaning and standardization"""
+        if not product_name:
+            return "Producto sin nombre"
+        
+        # Remove extra whitespace
+        cleaned = re.sub(r'\s+', ' ', str(product_name).strip())
+        
+        # Remove special characters but keep Spanish characters
+        cleaned = re.sub(r'[^\w\s\-áéíóúñçüÁÉÍÓÚÑÇÜ]', '', cleaned)
+        
+        # Capitalize first letter of each word
+        cleaned = cleaned.title()
+        
+        # Enhanced corrections for Spanish bakeries
+        replacements = {
+            'Pan De': 'Pan de',
+            'Café Con': 'Café con',
+            'Te ': 'Té ',
+            'Bocadillo De': 'Bocadillo de',
+            'Dulce De': 'Dulce de',
+            'Tarta De': 'Tarta de',
+        }
+        
+        for old, new in replacements.items():
+            cleaned = cleaned.replace(old, new)
+        
+        return cleaned if cleaned else "Producto sin nombre"
+    
+    def _structure_messages(self, messages: List[Union[str, Dict]]) -> List[Dict[str, Any]]:
+        """Convert string messages to structured format"""
+        structured = []
+        for msg in messages:
+            if isinstance(msg, str):
+                structured.append({
+                    "type": "general_message",
+                    "message": msg,
+                    "field": None,
+                    "row": None,
+                    "code": "GENERAL_MESSAGE"
+                })
+            else:
+                structured.append(msg)
+        return structured
+    
+    def _generate_suggestions(
+        self, 
+        validation_result: SalesValidationResult, 
+        format_type: str, 
+        warning_count: int
+    ) -> List[str]:
+        """Generate enhanced contextual suggestions"""
+        suggestions = []
+        
+        if validation_result.is_valid:
+            suggestions.append("El archivo está listo para procesamiento")
+            suggestions.append(f"Se procesarán aproximadamente {validation_result.total_records} registros")
+            
+            if validation_result.total_records > 1000:
+                suggestions.append("Archivo grande: el procesamiento puede tomar varios minutos")
+                suggestions.append("Considera dividir archivos muy grandes para mejor rendimiento")
+            
+            if warning_count > 0:
+                suggestions.append("Revisa las advertencias antes de continuar")
+                suggestions.append("Los datos con advertencias se procesarán con valores por defecto")
+                
+            # Format-specific suggestions
+            if format_type == "csv":
+                suggestions.append("Asegúrate de que las fechas estén en formato DD/MM/YYYY")
+                suggestions.append("Verifica que los números usen punto decimal (no coma)")
+            elif format_type in ["excel", "xlsx"]:
+                suggestions.append("Solo se procesará la primera hoja del archivo Excel")
+                suggestions.append("Evita celdas combinadas y fórmulas complejas")
+        else:
+            suggestions.append("Corrige los errores antes de continuar")
+            suggestions.append("Verifica que el archivo tenga el formato correcto")
+            
+            if format_type not in ["csv", "excel", "xlsx", "json"]:
+                suggestions.append("Usa formato CSV o Excel para mejores resultados")
+                suggestions.append("El formato JSON es para usuarios avanzados")
+            
+            if validation_result.total_records == 0:
+                suggestions.append("Asegúrate de que el archivo contenga datos")
+                suggestions.append("Verifica que el archivo no esté corrupto")
+                
+            # Missing column suggestions
+            error_codes = [error.get("code", "") for error in validation_result.errors if isinstance(error, dict)]
+            if "MISSING_DATE_COLUMN" in error_codes:
+                suggestions.append("Incluye una columna de fecha (fecha, date, dia)")
+            if "MISSING_PRODUCT_COLUMN" in error_codes:
+                suggestions.append("Incluye una columna de producto (producto, product, item)")
+        
+        return suggestions
+
+
+# Main DataImportService class with enhanced functionality