Refactor datetime and timezone utils

This commit is contained in:
Urtzi Alfaro
2025-10-12 23:16:04 +02:00
parent 7556a00db7
commit 96ad5c6692
11 changed files with 731 additions and 362 deletions

View File

@@ -33,7 +33,7 @@ optuna.logging.set_verbosity(optuna.logging.WARNING)
from app.core.config import settings
from app.core import constants as const
from app.utils.timezone_utils import prepare_prophet_datetime
from app.utils.ml_datetime import prepare_prophet_datetime
from app.utils.file_utils import ChecksummedFile, calculate_file_checksum
from app.utils.distributed_lock import get_training_lock, LockAcquisitionError

View File

@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
import logging
from app.utils.timezone_utils import ensure_timezone_aware
from app.utils.ml_datetime import ensure_timezone_aware
logger = logging.getLogger(__name__)

View File

@@ -2,7 +2,7 @@
Training Service Utilities
"""
from .timezone_utils import (
from .ml_datetime import (
ensure_timezone_aware,
ensure_timezone_naive,
normalize_datetime_to_utc,

View File

@@ -1,10 +1,14 @@
"""
Timezone Utility Functions
Centralized timezone handling to ensure consistency across the training service
ML-Specific DateTime Utilities
DateTime utilities for machine learning operations, specifically for:
- Prophet forecasting model (requires timezone-naive datetimes)
- Pandas DataFrame datetime operations
- Time series data processing
"""
from datetime import datetime, timezone
from typing import Optional, Union
from typing import Union
import pandas as pd
import logging
@@ -61,15 +65,12 @@ def normalize_datetime_to_utc(dt: Union[datetime, pd.Timestamp]) -> datetime:
if dt is None:
return None
# Handle pandas Timestamp
if isinstance(dt, pd.Timestamp):
dt = dt.to_pydatetime()
# If naive, assume UTC
if dt.tzinfo is None:
return dt.replace(tzinfo=timezone.utc)
# If aware but not UTC, convert to UTC
return dt.astimezone(timezone.utc)
@@ -93,19 +94,15 @@ def normalize_dataframe_datetime_column(
logger.warning(f"Column {column} not found in dataframe")
return df
# Convert to datetime if not already
df[column] = pd.to_datetime(df[column])
if target_format == 'naive':
# Remove timezone if present
if df[column].dt.tz is not None:
df[column] = df[column].dt.tz_localize(None)
elif target_format == 'aware':
# Add UTC timezone if not present
if df[column].dt.tz is None:
df[column] = df[column].dt.tz_localize(timezone.utc)
else:
# Convert to UTC if different timezone
df[column] = df[column].dt.tz_convert(timezone.utc)
else:
raise ValueError(f"Invalid target_format: {target_format}. Must be 'naive' or 'aware'")
@@ -140,7 +137,6 @@ def safe_datetime_comparison(dt1: datetime, dt2: datetime) -> int:
Returns:
-1 if dt1 < dt2, 0 if equal, 1 if dt1 > dt2
"""
# Normalize both to UTC for comparison
dt1_utc = normalize_datetime_to_utc(dt1)
dt2_utc = normalize_datetime_to_utc(dt2)
@@ -176,9 +172,99 @@ def convert_timestamp_to_datetime(timestamp: Union[int, float, str]) -> datetime
dt = pd.to_datetime(timestamp)
return normalize_datetime_to_utc(dt)
# Check if milliseconds (typical JavaScript timestamp)
if timestamp > 1e10:
timestamp = timestamp / 1000
dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
return dt
def align_dataframe_dates(
    dfs: list[pd.DataFrame],
    date_column: str = 'ds',
    method: str = 'inner'
) -> list[pd.DataFrame]:
    """
    Filter several DataFrames down to a shared set of calendar dates.

    Dates are compared at day granularity (the time-of-day part of
    ``date_column`` is ignored). Rows are only ever removed, never added:
    with ``method='outer'`` every row is kept, because each frame's dates
    are by construction a subset of the union.

    Args:
        dfs: List of DataFrames to align
        date_column: Name of the date column
        method: 'inner' (keep dates present in every frame that has the
            column) or 'outer' (keep dates present in any frame)

    Returns:
        List of DataFrames in the same order as ``dfs``. Frames that lack
        ``date_column`` are passed through as the same object; the others
        are filtered copies whose ``date_column`` is converted with
        ``pd.to_datetime``. An empty input returns ``[]`` and a
        single-element input is returned as-is.

    Raises:
        ValueError: If ``method`` is neither 'inner' nor 'outer'.
    """
    # Reject unknown methods up front; silently ignoring them would make the
    # result depend only on the first frame's dates.
    if method not in ('inner', 'outer'):
        raise ValueError(f"Invalid method: {method}. Must be 'inner' or 'outer'")

    if not dfs:
        return []

    if len(dfs) == 1:
        return dfs

    # Parse each date column once; None marks frames without the column.
    parsed = [
        pd.to_datetime(df[date_column]) if date_column in df.columns else None
        for df in dfs
    ]

    date_sets = [set(dates.dt.date) for dates in parsed if dates is not None]
    if not date_sets:
        # No frame carries the date column, so there is nothing to align.
        return list(dfs)

    combine = set.intersection if method == 'inner' else set.union
    all_dates = combine(*date_sets)

    aligned_dfs = []
    for df, dates in zip(dfs, parsed):
        if dates is None:
            aligned_dfs.append(df)
            continue

        aligned = df.copy()
        aligned[date_column] = dates
        # Filter with a boolean mask instead of a temporary helper column so
        # that no existing column of the caller can be overwritten or dropped.
        keep = dates.dt.date.isin(all_dates)
        aligned_dfs.append(aligned[keep])

    return aligned_dfs
def fill_missing_dates(
    df: pd.DataFrame,
    date_column: str = 'ds',
    freq: str = 'D',
    fill_value: float = 0.0
) -> pd.DataFrame:
    """
    Insert rows for timestamps missing between the first and last date.

    The input is not modified. The date column is converted with
    ``pd.to_datetime`` and the frame is reindexed onto a regular
    ``pd.date_range`` spanning its minimum to maximum date, so the result
    is ordered by date with ``date_column`` as its first column.

    Args:
        df: DataFrame with date column
        date_column: Name of the date column
        freq: Pandas frequency string ('D' for daily, 'h' for hourly
            ('H' on older pandas versions), etc.)
        fill_value: Value written into every non-date column of the
            inserted rows

    Returns:
        DataFrame with filled dates. If the frame has no valid dates
        (it is empty or every date is NaT) the converted copy is returned
        unchanged, since there is no range to fill.
    """
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column])

    start = df[date_column].min()
    end = df[date_column].max()
    # min()/max() yield NaT when there are no valid dates; pd.date_range
    # cannot build a range from NaT bounds, so return early instead.
    if pd.isna(start):
        return df

    df = df.set_index(date_column)
    full_range = pd.date_range(start=start, end=end, freq=freq)
    df = df.reindex(full_range, fill_value=fill_value)

    # Name the index explicitly before resetting it. Relying on the default
    # 'index' label would rename an existing data column called 'index'.
    return df.rename_axis(date_column).reset_index()