Fix issues 3

This commit is contained in:
Urtzi Alfaro
2025-08-17 13:35:05 +02:00
parent d21094a940
commit cafd316c4b
5 changed files with 157 additions and 49 deletions

View File

@@ -176,36 +176,65 @@ class TrainingDataOrchestrator:
logger.error(f"Training data preparation failed: {str(e)}")
raise ValueError(f"Failed to prepare training data: {str(e)}")
def extract_sales_date_range_utc_localize(sales_data_df: pd.DataFrame) -> 'DateRange':
    """
    Extract the UTC-aware date range covered by a sales DataFrame.

    The 'date' column is parsed into UTC timestamps: naive values are
    interpreted as UTC and timezone-aware values are converted to UTC.
    The caller's DataFrame is left unmodified.

    Args:
        sales_data_df: A pandas DataFrame containing a 'date' column.

    Returns:
        A DateRange built from the earliest and latest UTC timestamps in
        the 'date' column, tagged with DataSourceType.BAKERY_SALES.

    Raises:
        ValueError: If the 'date' column is missing, or if it holds no
            valid dates (empty frame or only null values).
    """
    if 'date' not in sales_data_df.columns:
        raise ValueError("DataFrame does not contain a 'date' column.")

    # Series.tz_localize() localizes the *index*, not the values, so calling
    # it on the column fails for a default RangeIndex. Parsing with utc=True
    # localizes naive values and converts aware ones in a single step, and
    # working on a separate Series avoids overwriting the caller's column.
    utc_dates = pd.to_datetime(sales_data_df['date'], utc=True)

    # min()/max() skip NaT; they only return NaT when nothing valid remains.
    start_date = utc_dates.min()
    end_date = utc_dates.max()
    if pd.isna(start_date) or pd.isna(end_date):
        raise ValueError("DataFrame does not contain any valid dates.")

    return DateRange(start_date, end_date, DataSourceType.BAKERY_SALES)
def _extract_sales_date_range(self, sales_data: List[Dict[str, Any]]) -> 'DateRange':
    """
    Extract the UTC date range covered by a list of sales records.

    Args:
        sales_data: List of sales records; the range is computed from
            each record's 'date' entry.

    Returns:
        DateRange object with timezone-aware (UTC) start and end dates,
        tagged with DataSourceType.BAKERY_SALES.

    Raises:
        ValueError: If no records are given, no record has a 'date'
            entry, or none of the dates can be parsed.
    """
    if not sales_data:
        raise ValueError("No sales data provided for date range extraction")

    # Convert to DataFrame for easier processing
    sales_df = pd.DataFrame(sales_data)
    if 'date' not in sales_df.columns:
        raise ValueError("Sales data does not contain a 'date' column")

    # utc=True makes naive values UTC and converts aware values to UTC;
    # errors='coerce' turns unparseable entries into NaT so they can be dropped.
    # NOTE(review): with pandas >= 2.0 a single format is inferred from the
    # first value, so differently formatted strings are coerced to NaT too —
    # confirm upstream normalizes dates, or pass format='mixed'.
    parsed_dates = pd.to_datetime(sales_df['date'], utc=True, errors='coerce').dropna()
    if parsed_dates.empty:
        raise ValueError("No valid dates found in sales data")

    # Surface dropped rows instead of discarding them silently.
    skipped = len(sales_df) - len(parsed_dates)
    if skipped:
        logger.warning(f"Ignored {skipped} sales records with missing or invalid dates")

    # Find the minimum and maximum dates
    start_date = parsed_dates.min()
    end_date = parsed_dates.max()

    logger.info(f"Extracted sales date range: {start_date} to {end_date}")
    return DateRange(start_date, end_date, DataSourceType.BAKERY_SALES)