Fix issues 3

This commit is contained in:
Urtzi Alfaro
2025-08-17 13:35:05 +02:00
parent d21094a940
commit cafd316c4b
5 changed files with 157 additions and 49 deletions

View File

@@ -865,34 +865,65 @@ class DataImportService:
return mapping
def _parse_date(self, date_str: str) -> Optional[datetime]:
    """Parse a raw CSV date value into a timezone-aware UTC datetime.

    Parsing strategy (in order):
      1. Explicit priority formats — tried first so unambiguous
         machine-generated forms like ``2024/10/01`` (YYYY/MM/DD) are never
         misread as day-first.
      2. pandas inference, with ``dayfirst`` chosen from the string's shape.
      3. Any remaining formats in ``self.DATE_FORMATS``.

    Args:
        date_str: Raw date cell value; NaN/null markers are treated as missing.

    Returns:
        A timezone-aware ``datetime`` in UTC, or ``None`` if the value is
        missing or cannot be parsed by any strategy.
    """
    if not date_str or str(date_str).lower() in ['nan', 'null', 'none']:
        return None

    date_str = str(date_str).strip()

    # Priority order: YYYY/MM/DD first — most likely for CSV exports and
    # unambiguous, so it must win before any day/month-first interpretation.
    priority_formats = [
        '%Y/%m/%d',  # 2024/10/01 (October 1, 2024)
        '%Y-%m-%d',  # 2024-10-01
        '%d/%m/%Y',  # 01/10/2024 (European format)
        '%m/%d/%Y',  # 10/01/2024 (US format)
    ]

    for fmt in priority_formats:
        try:
            parsed_dt = datetime.strptime(date_str, fmt)
            if parsed_dt.tzinfo is None:
                parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
            logger.debug(f"Successfully parsed date '{date_str}' using format '{fmt}' -> {parsed_dt}")
            return parsed_dt
        except ValueError:
            continue

    # Pandas fallback. When the segment before the first '/' is 4 digits the
    # string looks like YYYY/MM/DD, so dayfirst must be off to stop pandas
    # reading the year position as a day; otherwise prefer European dates.
    try:
        if '/' in date_str and len(date_str.split('/')[0]) == 4:
            parsed_dt = pd.to_datetime(date_str, dayfirst=False)
        else:
            parsed_dt = pd.to_datetime(date_str, dayfirst=True)
        if hasattr(parsed_dt, 'to_pydatetime'):
            parsed_dt = parsed_dt.to_pydatetime()
        if parsed_dt.tzinfo is None:
            parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
        logger.debug(f"Successfully parsed date '{date_str}' using pandas -> {parsed_dt}")
        return parsed_dt
    except Exception as e:
        # Best-effort fallback: log and continue to the configured formats.
        logger.debug(f"Pandas date parsing failed for '{date_str}': {e}")

    # Last fallback: remaining configured formats (skip ones already tried).
    for fmt in self.DATE_FORMATS:
        if fmt in priority_formats:
            continue
        try:
            parsed_dt = datetime.strptime(date_str, fmt)
            if parsed_dt.tzinfo is None:
                parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
            logger.debug(f"Successfully parsed date '{date_str}' using fallback format '{fmt}' -> {parsed_dt}")
            return parsed_dt
        except ValueError:
            continue

    logger.warning(f"Could not parse date: {date_str}")
    return None

View File

@@ -176,36 +176,65 @@ class TrainingDataOrchestrator:
logger.error(f"Training data preparation failed: {str(e)}")
raise ValueError(f"Failed to prepare training data: {str(e)}")
def _extract_sales_date_range(self, sales_data: List[Dict[str, Any]]) -> 'DateRange':
    """Extract the min/max date range from sales records.

    Dates are coerced to timezone-aware UTC timestamps via pandas; records
    whose date cannot be parsed are dropped rather than failing the whole
    extraction.

    Args:
        sales_data: List of sales record dicts, each expected to carry a
            'date' key (string or datetime).

    Returns:
        DateRange with timezone-aware UTC start and end dates.

    Raises:
        ValueError: If no records are given, the 'date' column is missing,
            or no record contains a parseable date.
    """
    if not sales_data:
        raise ValueError("No sales data provided for date range extraction")

    # DataFrame conversion lets pandas vectorize the date parsing below.
    sales_df = pd.DataFrame(sales_data)

    if 'date' not in sales_df.columns:
        raise ValueError("Sales data does not contain a 'date' column")

    # utc=True yields timezone-aware timestamps; errors='coerce' turns
    # unparseable values into NaT so they can be filtered out.
    sales_df['date'] = pd.to_datetime(sales_df['date'], utc=True, errors='coerce')

    # Drop rows whose date failed to parse.
    sales_df = sales_df.dropna(subset=['date'])
    if len(sales_df) == 0:
        raise ValueError("No valid dates found in sales data")

    start_date = sales_df['date'].min()
    end_date = sales_df['date'].max()

    logger.info(f"Extracted sales date range: {start_date} to {end_date}")
    return DateRange(start_date, end_date, DataSourceType.BAKERY_SALES)

View File

@@ -6,7 +6,8 @@ Main training service that uses the repository pattern for data access
from typing import Dict, List, Any, Optional
import uuid
import structlog
from datetime import datetime
from datetime import datetime, date, timezone
from decimal import Decimal
from sqlalchemy.ext.asyncio import AsyncSession
import json
import numpy as np
@@ -37,10 +38,26 @@ logger = structlog.get_logger()
def make_json_serializable(obj):
"""Convert numpy/pandas types, datetime, and UUID objects to JSON-serializable Python types"""
import uuid
from decimal import Decimal
from datetime import datetime, date
# Handle None values
if obj is None:
return None
# Handle basic datetime types first (most common)
if isinstance(obj, datetime):
return obj.isoformat()
elif isinstance(obj, date):
return obj.isoformat()
# Handle pandas timestamp types
if hasattr(pd, 'Timestamp') and isinstance(obj, pd.Timestamp):
return obj.isoformat()
# Handle numpy datetime types
if hasattr(np, 'datetime64') and isinstance(obj, np.datetime64):
return pd.Timestamp(obj).isoformat()
# Handle numeric types
if isinstance(obj, (np.integer, pd.Int64Dtype)):
return int(obj)
elif isinstance(obj, (np.floating, pd.Float64Dtype)):
@@ -51,19 +68,36 @@ def make_json_serializable(obj):
return obj.tolist()
elif isinstance(obj, pd.DataFrame):
return obj.to_dict('records')
elif isinstance(obj, Decimal):
return float(obj)
# Handle UUID types
elif isinstance(obj, uuid.UUID):
return str(obj)
elif hasattr(obj, '__class__') and 'UUID' in str(obj.__class__):
# Handle any UUID-like objects (including asyncpg.pgproto.pgproto.UUID)
return str(obj)
elif isinstance(obj, Decimal):
return float(obj)
# Handle collections recursively
elif isinstance(obj, dict):
return {k: make_json_serializable(v) for k, v in obj.items()}
elif isinstance(obj, list):
elif isinstance(obj, (list, tuple)):
return [make_json_serializable(item) for item in obj]
else:
elif isinstance(obj, set):
return [make_json_serializable(item) for item in obj]
# Handle other common types
elif isinstance(obj, (str, int, float, bool)):
return obj
# Last resort: try to convert to string
else:
try:
# For any other object, try to convert to string
return str(obj)
except Exception:
# If all else fails, return None
return None
class EnhancedTrainingService: