Improve the sales import

2025-10-15 21:09:42 +02:00
parent 8f9e9a7edc
commit dbb48d8e2c
21 changed files with 992 additions and 409 deletions
--- a/services/sales/app/repositories/sales_repository.py
+++ b/services/sales/app/repositories/sales_repository.py
@@ -265,18 +265,60 @@ class SalesRepository(BaseRepository[SalesData, SalesDataCreate, SalesDataUpdate
            record = await self.get_by_id(record_id)
            if not record:
                raise ValueError(f"Sales record {record_id} not found")
-            
+
            update_data = {
                'is_validated': True,
                'validation_notes': validation_notes
            }
-            
+
            updated_record = await self.update(record_id, update_data)
-            
+
            logger.info("Validated sales record", record_id=record_id)
            return updated_record
-            
+
        except Exception as e:
            logger.error("Failed to validate sales record", error=str(e), record_id=record_id)
            raise
-        
+
+    async def create_sales_records_bulk(
+        self,
+        sales_data_list: List[SalesDataCreate],
+        tenant_id: UUID
+    ) -> int:
+        """Bulk-insert sales records with a single flush; return the number inserted"""
+        try:
+            from uuid import uuid4
+
+            records = []
+            for sales_data in sales_data_list:
+                is_weekend = sales_data.date.weekday() >= 5 if sales_data.date else False
+
+                record = SalesData(
+                    id=uuid4(),
+                    tenant_id=tenant_id,
+                    date=sales_data.date,
+                    inventory_product_id=sales_data.inventory_product_id,
+                    quantity_sold=sales_data.quantity_sold,
+                    unit_price=sales_data.unit_price,
+                    revenue=sales_data.revenue,
+                    location_id=sales_data.location_id,
+                    sales_channel=sales_data.sales_channel,
+                    source=sales_data.source,
+                    is_weekend=is_weekend,
+                    is_validated=getattr(sales_data, 'is_validated', False)
+                )
+                records.append(record)
+
+            self.session.add_all(records)
+            await self.session.flush()
+
+            logger.info(
+                "Bulk created sales records",
+                count=len(records),
+                tenant_id=tenant_id
+            )
+            return len(records)
+
+        except Exception as e:
+            logger.error("Failed to bulk create sales records", error=str(e), tenant_id=tenant_id)
+            raise
--- a/services/sales/app/services/data_import_service.py
+++ b/services/sales/app/services/data_import_service.py
@@ -442,17 +442,17 @@ class DataImportService:
            )
    
    async def _process_csv_data(
-        self, 
-        tenant_id: str, 
-        csv_content: str, 
-        repository: SalesRepository, 
+        self,
+        tenant_id: str,
+        csv_content: str,
+        repository: SalesRepository,
        filename: Optional[str] = None
    ) -> Dict[str, Any]:
-        """Enhanced CSV processing with batch product resolution for better reliability"""
+        """Parse CSV rows, batch-resolve products, then bulk-insert sales records"""
        try:
            reader = csv.DictReader(io.StringIO(csv_content))
            rows = list(reader)
-            
+
            if not rows:
                return {
                    "success": False,
@@ -461,19 +461,18 @@ class DataImportService:
                    "errors": ["CSV file is empty"],
                    "warnings": []
                }
-            
-            # Enhanced column mapping
+
            column_mapping = self._detect_columns(list(rows[0].keys()))
-            
-            # Pre-process to extract unique products for batch creation
+
            unique_products = set()
            parsed_rows = []
-            
-            logger.info(f"Pre-processing {len(rows)} records to identify unique products")
-            
+            errors = []
+            warnings = []
+
+            logger.info(f"Parsing {len(rows)} CSV records")
+
            for index, row in enumerate(rows):
                try:
-                    # Enhanced data parsing and validation
                    parsed_data = await self._parse_row_data(row, column_mapping, index + 1)
                    if not parsed_data.get("skip"):
                        unique_products.add((
@@ -481,38 +480,52 @@ class DataImportService:
                            parsed_data.get("product_category", "general")
                        ))
                        parsed_rows.append((index, parsed_data))
+                    else:
+                        errors.extend(parsed_data.get("errors", []))
+                        warnings.extend(parsed_data.get("warnings", []))
                except Exception as e:
                    logger.warning(f"Failed to parse row {index + 1}: {e}")
+                    errors.append(f"Row {index + 1}: Parse error - {str(e)}")
                    continue
-            
-            logger.info(f"Found {len(unique_products)} unique products, attempting batch resolution")
-            
-            # Try to resolve/create all unique products in batch
-            await self._batch_resolve_products(unique_products, tenant_id)
-            
-            # Now process the actual sales records
-            records_created = 0
-            errors = []
-            warnings = []
-            
-            logger.info(f"Processing {len(parsed_rows)} validated records for sales creation")
-            
+
+            logger.info(f"Batch resolving {len(unique_products)} unique products")
+
+            products_batch = [
+                {"name": name, "category": category}
+                for name, category in unique_products
+            ]
+
+            batch_result = await self.inventory_client.resolve_or_create_products_batch(
+                products_batch,
+                tenant_id
+            )
+
+            if batch_result and 'product_mappings' in batch_result:
+                self.product_cache.update(batch_result['product_mappings'])
+                logger.info(f"Resolved {len(batch_result['product_mappings'])} products in single batch call")
+            else:
+                logger.error("Batch product resolution failed")
+                return {
+                    "success": False,
+                    "total_rows": len(rows),
+                    "records_created": 0,
+                    "errors": ["Failed to resolve products in inventory"],
+                    "warnings": warnings
+                }
+
+            sales_records_batch = []
+
            for index, parsed_data in parsed_rows:
+                product_name = parsed_data["product_name"]
+
+                if product_name not in self.product_cache:
+                    errors.append(f"Row {index + 1}: Product '{product_name}' not found in cache")
+                    continue
+
                try:
-                    # Resolve product name to inventory_product_id (should be cached now)
-                    inventory_product_id = await self._resolve_product_to_inventory_id(
-                        parsed_data["product_name"], 
-                        parsed_data.get("product_category"),
-                        tenant_id
-                    )
-                    
-                    if not inventory_product_id:
-                        error_msg = f"Row {index + 1}: Could not resolve product '{parsed_data['product_name']}' to inventory ID"
-                        errors.append(error_msg)
-                        logger.warning("Product resolution failed", error=error_msg)
-                        continue
-                    
-                    # Create sales record with enhanced data
+                    from uuid import UUID
+                    inventory_product_id = UUID(self.product_cache[product_name])
+
                    sales_data = SalesDataCreate(
                        tenant_id=tenant_id,
                        date=parsed_data["date"],
@@ -523,32 +536,35 @@ class DataImportService:
                        location_id=parsed_data.get("location_id"),
                        source="csv"
                    )
-                    
-                    created_record = await repository.create_sales_record(sales_data, tenant_id)
-                    records_created += 1
-                    
-                    # Enhanced progress logging
-                    if records_created % 100 == 0:
-                        logger.info(f"Enhanced processing: {records_created}/{len(rows)} records completed...")
-                        
+
+                    sales_records_batch.append(sales_data)
+
                except Exception as e:
-                    error_msg = f"Row {index + 1}: {str(e)}"
-                    errors.append(error_msg)
-                    logger.warning("Enhanced record processing failed", error=error_msg)
-            
+                    errors.append(f"Row {index + 1}: {str(e)}")
+                    continue
+
+            if sales_records_batch:
+                logger.info(f"Bulk inserting {len(sales_records_batch)} sales records")
+                records_created = await repository.create_sales_records_bulk(
+                    sales_records_batch,
+                    tenant_id
+                )
+            else:
+                records_created = 0
+
            success_rate = (records_created / len(rows)) * 100 if rows else 0
-            
+
            return {
                "success": records_created > 0,
                "total_rows": len(rows),
                "records_created": records_created,
                "success_rate": success_rate,
-                "errors": errors,
-                "warnings": warnings
+                "errors": errors[:50],
+                "warnings": warnings[:50]
            }
-            
+
        except Exception as e:
-            logger.error("Enhanced CSV processing failed", error=str(e))
+            logger.error("CSV processing failed", error=str(e))
            raise
    
    async def _process_json_data(
@@ -633,66 +649,95 @@ class DataImportService:
            raise
    
    async def _process_dataframe(
-        self, 
-        tenant_id: str, 
-        df: pd.DataFrame, 
+        self,
+        tenant_id: str,
+        df: pd.DataFrame,
        repository: SalesRepository,
        source: str,
        filename: Optional[str] = None
    ) -> Dict[str, Any]:
-        """Enhanced DataFrame processing with better error handling"""
+        """Parse DataFrame rows, batch-resolve products, then bulk-insert sales records"""
        try:
-            # Enhanced column mapping
            column_mapping = self._detect_columns(df.columns.tolist())
-            
+
            if not column_mapping.get('date') or not column_mapping.get('product'):
                required_missing = []
                if not column_mapping.get('date'):
                    required_missing.append("date")
                if not column_mapping.get('product'):
                    required_missing.append("product")
-                
+
                raise ValueError(f"Required columns missing: {', '.join(required_missing)}")
-            
-            records_created = 0
+
+            unique_products = set()
+            parsed_rows = []
            errors = []
            warnings = []
-            
-            logger.info(f"Enhanced processing of {len(df)} records from {source}")
-            
+
+            logger.info(f"Processing {len(df)} records from {source}")
+
            for index, row in df.iterrows():
                try:
-                    # Convert pandas row to dict
                    row_dict = {}
                    for col in df.columns:
                        val = row[col]
-                        # Handle pandas NaN values
                        if pd.isna(val):
                            row_dict[col] = None
                        else:
                            row_dict[col] = val
-                    
-                    # Enhanced data parsing
+
                    parsed_data = await self._parse_row_data(row_dict, column_mapping, index + 1)
-                    if parsed_data.get("skip"):
+                    if not parsed_data.get("skip"):
+                        unique_products.add((
+                            parsed_data["product_name"],
+                            parsed_data.get("product_category", "general")
+                        ))
+                        parsed_rows.append((index, parsed_data))
+                    else:
                        errors.extend(parsed_data.get("errors", []))
                        warnings.extend(parsed_data.get("warnings", []))
-                        continue
-                    
-                    # Resolve product name to inventory_product_id
-                    inventory_product_id = await self._resolve_product_to_inventory_id(
-                        parsed_data["product_name"], 
-                        parsed_data.get("product_category"),
-                        tenant_id
-                    )
-                    
-                    if not inventory_product_id:
-                        error_msg = f"Row {index + 1}: Could not resolve product '{parsed_data['product_name']}' to inventory ID"
-                        errors.append(error_msg)
-                        logger.warning("Product resolution failed", error=error_msg)
-                        continue
-                    
-                    # Create enhanced sales record
+
+                except Exception as e:
+                    errors.append(f"Row {index + 1}: {str(e)}")
+                    continue
+
+            logger.info(f"Batch resolving {len(unique_products)} unique products")
+
+            products_batch = [
+                {"name": name, "category": category}
+                for name, category in unique_products
+            ]
+
+            batch_result = await self.inventory_client.resolve_or_create_products_batch(
+                products_batch,
+                tenant_id
+            )
+
+            if batch_result and 'product_mappings' in batch_result:
+                self.product_cache.update(batch_result['product_mappings'])
+                logger.info(f"Resolved {len(batch_result['product_mappings'])} products in batch")
+            else:
+                return {
+                    "success": False,
+                    "total_rows": len(df),
+                    "records_created": 0,
+                    "errors": ["Failed to resolve products"],
+                    "warnings": warnings
+                }
+
+            sales_records_batch = []
+
+            for index, parsed_data in parsed_rows:
+                product_name = parsed_data["product_name"]
+
+                if product_name not in self.product_cache:
+                    errors.append(f"Row {index + 1}: Product '{product_name}' not in cache")
+                    continue
+
+                try:
+                    from uuid import UUID
+                    inventory_product_id = UUID(self.product_cache[product_name])
+
                    sales_data = SalesDataCreate(
                        tenant_id=tenant_id,
                        date=parsed_data["date"],
@@ -703,34 +748,37 @@ class DataImportService:
                        location_id=parsed_data.get("location_id"),
                        source=source
                    )
-                    
-                    created_record = await repository.create_sales_record(sales_data, tenant_id)
-                    records_created += 1
-                    
-                    # Progress logging for large datasets
-                    if records_created % 100 == 0:
-                        logger.info(f"Enhanced DataFrame processing: {records_created}/{len(df)} records completed...")
-                        
+
+                    sales_records_batch.append(sales_data)
+
                except Exception as e:
-                    error_msg = f"Row {index + 1}: {str(e)}"
-                    errors.append(error_msg)
-                    logger.warning("Enhanced record processing failed", error=error_msg)
-            
+                    errors.append(f"Row {index + 1}: {str(e)}")
+                    continue
+
+            if sales_records_batch:
+                logger.info(f"Bulk inserting {len(sales_records_batch)} sales records")
+                records_created = await repository.create_sales_records_bulk(
+                    sales_records_batch,
+                    tenant_id
+                )
+            else:
+                records_created = 0
+
            success_rate = (records_created / len(df)) * 100 if len(df) > 0 else 0
-            
+
            return {
                "success": records_created > 0,
                "total_rows": len(df),
                "records_created": records_created,
                "success_rate": success_rate,
-                "errors": errors[:10],  # Limit errors for performance
-                "warnings": warnings[:10]  # Limit warnings
+                "errors": errors[:50],
+                "warnings": warnings[:50]
            }
-            
+
        except ValueError:
            raise
        except Exception as e:
-            logger.error("Enhanced DataFrame processing failed", error=str(e))
+            logger.error("DataFrame processing failed", error=str(e))
            raise
    
    async def _parse_row_data(
@@ -983,194 +1031,6 @@ class DataImportService:
        self.failed_products.clear()
        logger.info("Import cache cleared for new session")
    
-    async def _resolve_product_to_inventory_id(self, product_name: str, product_category: Optional[str], tenant_id: UUID) -> Optional[UUID]:
-        """Resolve a product name to an inventory_product_id via the inventory service with improved error handling and fallback"""
-        
-        # Check cache first
-        if product_name in self.product_cache:
-            logger.debug("Product resolved from cache", product_name=product_name, tenant_id=tenant_id)
-            return self.product_cache[product_name]
-        
-        # Skip if this product already failed to resolve after all attempts
-        if product_name in self.failed_products:
-            logger.debug("Skipping previously failed product", product_name=product_name, tenant_id=tenant_id)
-            return None
-        
-        max_retries = 5  # Increased retries
-        base_delay = 2.0  # Increased base delay
-        fallback_retry_delay = 10.0  # Longer delay for fallback attempts
-        
-        for attempt in range(max_retries):
-            try:
-                # Add progressive delay to avoid rate limiting
-                if attempt > 0:
-                    # Use longer delays for later attempts
-                    if attempt >= 3:
-                        delay = fallback_retry_delay  # Use fallback delay for later attempts
-                    else:
-                        delay = base_delay * (2 ** (attempt - 1))  # Exponential backoff
-                    
-                    logger.info(f"Retrying product resolution after {delay}s delay", 
-                              product_name=product_name, attempt=attempt, tenant_id=tenant_id)
-                    await asyncio.sleep(delay)
-                
-                # First try to search for existing product by name
-                try:
-                    products = await self.inventory_client.search_products(product_name, tenant_id)
-                    
-                    if products:
-                        # Return the first matching product's ID
-                        product_id = products[0].get('id')
-                        if product_id:
-                            uuid_id = UUID(str(product_id))
-                            self.product_cache[product_name] = uuid_id  # Cache for future use
-                            logger.info("Resolved product to existing inventory ID", 
-                                      product_name=product_name, product_id=product_id, tenant_id=tenant_id)
-                            return uuid_id
-                except Exception as search_error:
-                    logger.warning("Product search failed, trying direct creation", 
-                                 product_name=product_name, error=str(search_error), tenant_id=tenant_id)
-                
-                # Add delay before creation attempt to avoid hitting rate limits
-                await asyncio.sleep(1.0)
-                
-                # If not found or search failed, create a new ingredient/product in inventory
-                ingredient_data = {
-                    'name': product_name,
-                    'type': 'finished_product',  # Assuming sales are of finished products
-                    'unit': 'unit',  # Default unit
-                    'current_stock': 0,  # No stock initially
-                    'reorder_point': 0,
-                    'cost_per_unit': 0,
-                    'category': product_category or 'general'
-                }
-                
-                try:
-                    created_product = await self.inventory_client.create_ingredient(ingredient_data, str(tenant_id))
-                    if created_product and created_product.get('id'):
-                        product_id = created_product['id']
-                        uuid_id = UUID(str(product_id))
-                        self.product_cache[product_name] = uuid_id  # Cache for future use
-                        logger.info("Created new inventory product for sales data", 
-                                  product_name=product_name, product_id=product_id, tenant_id=tenant_id)
-                        return uuid_id
-                except Exception as creation_error:
-                    logger.warning("Product creation failed", 
-                                 product_name=product_name, error=str(creation_error), tenant_id=tenant_id)
-                
-                logger.warning("Failed to resolve or create product in inventory", 
-                             product_name=product_name, tenant_id=tenant_id, attempt=attempt)
-                             
-            except Exception as e:
-                error_str = str(e)
-                if "429" in error_str or "rate limit" in error_str.lower() or "too many requests" in error_str.lower():
-                    logger.warning("Rate limit or service overload detected, retrying with longer delay", 
-                                 product_name=product_name, attempt=attempt, error=error_str, tenant_id=tenant_id)
-                    if attempt < max_retries - 1:
-                        continue  # Retry with exponential backoff
-                elif "503" in error_str or "502" in error_str or "service unavailable" in error_str.lower():
-                    logger.warning("Service unavailable, retrying with backoff", 
-                                 product_name=product_name, attempt=attempt, error=error_str, tenant_id=tenant_id)
-                    if attempt < max_retries - 1:
-                        continue  # Retry for service unavailable errors
-                elif "timeout" in error_str.lower() or "connection" in error_str.lower():
-                    logger.warning("Network issue detected, retrying", 
-                                 product_name=product_name, attempt=attempt, error=error_str, tenant_id=tenant_id)
-                    if attempt < max_retries - 1:
-                        continue  # Retry for network issues
-                else:
-                    logger.error("Non-retryable error resolving product to inventory ID", 
-                               error=error_str, product_name=product_name, tenant_id=tenant_id)
-                    if attempt < max_retries - 1:
-                        # Still retry even for other errors, in case it's transient
-                        continue
-                    else:
-                        break  # Don't retry on final attempt
-        
-        # If all retries failed, log detailed error but don't mark as permanently failed yet
-        # Instead, we'll implement a fallback mechanism
-        logger.error("Failed to resolve product after all retries, attempting fallback", 
-                   product_name=product_name, tenant_id=tenant_id)
-        
-        # FALLBACK: Try to create a temporary product with minimal data
-        try:
-            # Use a simplified approach with minimal data
-            fallback_data = {
-                'name': product_name,
-                'type': 'finished_product',
-                'unit': 'unit',
-                'current_stock': 0,
-                'cost_per_unit': 0
-            }
-            
-            logger.info("Attempting fallback product creation with minimal data", 
-                      product_name=product_name, tenant_id=tenant_id)
-            
-            created_product = await self.inventory_client.create_ingredient(fallback_data, str(tenant_id))
-            if created_product and created_product.get('id'):
-                product_id = created_product['id']
-                uuid_id = UUID(str(product_id))
-                self.product_cache[product_name] = uuid_id
-                logger.info("SUCCESS: Fallback product creation succeeded", 
-                          product_name=product_name, product_id=product_id, tenant_id=tenant_id)
-                return uuid_id
-        except Exception as fallback_error:
-            logger.error("Fallback product creation also failed", 
-                       product_name=product_name, error=str(fallback_error), tenant_id=tenant_id)
-        
-        # Only mark as permanently failed after all attempts including fallback
-        self.failed_products.add(product_name)
-        logger.error("CRITICAL: Permanently failed to resolve product - this will result in missing training data", 
-                   product_name=product_name, tenant_id=tenant_id)
-        return None
-    
-    async def _batch_resolve_products(self, unique_products: set, tenant_id: str) -> None:
-        """Batch resolve/create products to reduce API calls and improve success rate"""
-        
-        if not unique_products:
-            return
-        
-        logger.info(f"Starting batch product resolution for {len(unique_products)} unique products")
-        
-        # Convert set to list for easier handling
-        products_list = list(unique_products)
-        batch_size = 5  # Process in smaller batches to avoid overwhelming the inventory service
-        
-        for i in range(0, len(products_list), batch_size):
-            batch = products_list[i:i + batch_size]
-            logger.info(f"Processing batch {i//batch_size + 1}/{(len(products_list) + batch_size - 1)//batch_size}")
-            
-            # Process each product in the batch with retry logic
-            for product_name, product_category in batch:
-                try:
-                    # Skip if already in cache or failed list
-                    if product_name in self.product_cache or product_name in self.failed_products:
-                        continue
-                    
-                    # Try to resolve the product
-                    await self._resolve_product_to_inventory_id(product_name, product_category, tenant_id)
-                    
-                    # Add small delay between products to be gentle on the API
-                    await asyncio.sleep(0.5)
-                    
-                except Exception as e:
-                    logger.warning(f"Failed to batch process product {product_name}: {e}")
-                    continue
-            
-            # Add delay between batches
-            if i + batch_size < len(products_list):
-                logger.info("Waiting between batches to avoid rate limiting...")
-                await asyncio.sleep(2.0)
-        
-        successful_resolutions = len([p for p, _ in products_list if p in self.product_cache])
-        failed_resolutions = len([p for p, _ in products_list if p in self.failed_products])
-        
-        logger.info(f"Batch product resolution completed: {successful_resolutions} successful, {failed_resolutions} failed")
-        
-        if failed_resolutions > 0:
-            logger.warning(f"ATTENTION: {failed_resolutions} products failed to resolve - these will be missing from training data")
-            
-        return
    
    def _structure_messages(self, messages: List[Union[str, Dict]]) -> List[Dict[str, Any]]:
        """Convert string messages to structured format"""
--- a/services/sales/app/services/inventory_client.py
+++ b/services/sales/app/services/inventory_client.py
@@ -123,15 +123,35 @@ class InventoryServiceClient:
        try:
            result = await self._shared_client.create_ingredient(ingredient_data, tenant_id)
            if result:
-                logger.info("Created ingredient in inventory service", 
+                logger.info("Created ingredient in inventory service",
                          ingredient_name=ingredient_data.get('name'), tenant_id=tenant_id)
            return result
-            
+
        except Exception as e:
-            logger.error("Error creating ingredient", 
+            logger.error("Error creating ingredient",
                       error=str(e), ingredient_data=ingredient_data, tenant_id=tenant_id)
            return None

+    async def resolve_or_create_products_batch(
+        self,
+        products: List[Dict[str, Any]],
+        tenant_id: str
+    ) -> Optional[Dict[str, Any]]:
+        """Resolve or create multiple products in a single batch operation"""
+        try:
+            result = await self._shared_client.resolve_or_create_products_batch(products, tenant_id)
+            if result:
+                logger.info("Batch product resolution complete",
+                          created=result.get('created_count', 0),
+                          resolved=result.get('resolved_count', 0),
+                          tenant_id=tenant_id)
+            return result
+
+        except Exception as e:
+            logger.error("Error in batch product resolution",
+                       error=str(e), products_count=len(products), tenant_id=tenant_id)
+            return None
+
 # Dependency injection
 async def get_inventory_client() -> InventoryServiceClient:
    """Get inventory service client instance"""