Fix issues 3

This commit is contained in:
Urtzi Alfaro
2025-08-17 13:35:05 +02:00
parent d21094a940
commit cafd316c4b
5 changed files with 157 additions and 49 deletions

View File

@@ -176,36 +176,65 @@ class TrainingDataOrchestrator:
logger.error(f"Training data preparation failed: {str(e)}")
raise ValueError(f"Failed to prepare training data: {str(e)}")
def _extract_sales_date_range(self, sales_data: List[Dict[str, Any]]) -> DateRange:
"""Extract date range from sales data with timezone handling and strict date format."""
if not sales_data:
raise ValueError("No sales data provided")
def extract_sales_date_range_utc_localize(sales_data_df: pd.DataFrame):
    """
    Extract the UTC-aware date range covered by a sales DataFrame.

    The 'date' column is parsed into timestamps; timezone-naive values are
    interpreted as UTC and timezone-aware values are converted to UTC.
    The caller's DataFrame is left untouched.

    Args:
        sales_data_df: A pandas DataFrame containing a 'date' column.

    Returns:
        A DateRange tagged DataSourceType.BAKERY_SALES whose start and end
        are the earliest and latest timestamps found in the 'date' column.

    Raises:
        ValueError: If the 'date' column is missing or contains no dates.
            Values that pandas cannot parse propagate the error raised by
            ``pd.to_datetime``.
    """
    if 'date' not in sales_data_df.columns:
        raise ValueError("DataFrame does not contain a 'date' column.")

    # Series.tz_localize() acts on the *index*, not on the values, so the
    # column has to be made UTC-aware during parsing (utc=True) instead.
    # Working on a separate Series also avoids mutating the caller's frame.
    utc_dates = pd.to_datetime(sales_data_df['date'], utc=True).dropna()

    if utc_dates.empty:
        # min()/max() of an empty Series would yield NaT, which is not a
        # usable range boundary.
        raise ValueError("No valid dates found in sales data")

    start_date = utc_dates.min()
    end_date = utc_dates.max()

    return DateRange(start_date, end_date, DataSourceType.BAKERY_SALES)
def _extract_sales_date_range(self, sales_data: List[Dict[str, Any]]) -> 'DateRange':
"""
Extract date range from sales data with proper date parsing
Args:
sales_data: List of sales records
Returns:
DateRange object with timezone-aware start and end dates
"""
if not sales_data:
raise ValueError("No sales data provided for date range extraction")
# Convert to DataFrame for easier processing
sales_df = pd.DataFrame(sales_data)
if 'date' not in sales_df.columns:
raise ValueError("Sales data does not contain a 'date' column")
# Convert dates to datetime with proper parsing
# This will use the improved date parsing from the data import service
sales_df['date'] = pd.to_datetime(sales_df['date'], utc=True, errors='coerce')
# Remove any rows with invalid dates
sales_df = sales_df.dropna(subset=['date'])
if len(sales_df) == 0:
raise ValueError("No valid dates found in sales data")
start_date = min(dates)
end_date = max(dates)
# Find the minimum and maximum dates
start_date = sales_df['date'].min()
end_date = sales_df['date'].max()
logger.info(f"Extracted sales date range: {start_date} to {end_date}")
return DateRange(start_date, end_date, DataSourceType.BAKERY_SALES)

View File

@@ -6,7 +6,8 @@ Main training service that uses the repository pattern for data access
from typing import Dict, List, Any, Optional
import uuid
import structlog
from datetime import datetime
from datetime import datetime, date, timezone
from decimal import Decimal
from sqlalchemy.ext.asyncio import AsyncSession
import json
import numpy as np
@@ -37,10 +38,26 @@ logger = structlog.get_logger()
def make_json_serializable(obj):
"""Convert numpy/pandas types, datetime, and UUID objects to JSON-serializable Python types"""
import uuid
from decimal import Decimal
from datetime import datetime, date
# Handle None values
if obj is None:
return None
# Handle basic datetime types first (most common)
if isinstance(obj, datetime):
return obj.isoformat()
elif isinstance(obj, date):
return obj.isoformat()
# Handle pandas timestamp types
if hasattr(pd, 'Timestamp') and isinstance(obj, pd.Timestamp):
return obj.isoformat()
# Handle numpy datetime types
if hasattr(np, 'datetime64') and isinstance(obj, np.datetime64):
return pd.Timestamp(obj).isoformat()
# Handle numeric types
if isinstance(obj, (np.integer, pd.Int64Dtype)):
return int(obj)
elif isinstance(obj, (np.floating, pd.Float64Dtype)):
@@ -51,19 +68,36 @@ def make_json_serializable(obj):
return obj.tolist()
elif isinstance(obj, pd.DataFrame):
return obj.to_dict('records')
elif isinstance(obj, Decimal):
return float(obj)
# Handle UUID types
elif isinstance(obj, uuid.UUID):
return str(obj)
elif hasattr(obj, '__class__') and 'UUID' in str(obj.__class__):
# Handle any UUID-like objects (including asyncpg.pgproto.pgproto.UUID)
return str(obj)
elif isinstance(obj, Decimal):
return float(obj)
# Handle collections recursively
elif isinstance(obj, dict):
return {k: make_json_serializable(v) for k, v in obj.items()}
elif isinstance(obj, list):
elif isinstance(obj, (list, tuple)):
return [make_json_serializable(item) for item in obj]
else:
elif isinstance(obj, set):
return [make_json_serializable(item) for item in obj]
# Handle other common types
elif isinstance(obj, (str, int, float, bool)):
return obj
# Last resort: try to convert to string
else:
try:
# For any other object, try to convert to string
return str(obj)
except Exception:
# If all else fails, return None
return None
class EnhancedTrainingService: