Refactor datetime and timezone utils

2025-10-12 23:16:04 +02:00
parent 7556a00db7
commit 96ad5c6692
11 changed files with 731 additions and 362 deletions
--- a/services/training/app/ml/prophet_manager.py
+++ b/services/training/app/ml/prophet_manager.py
@@ -33,7 +33,7 @@ optuna.logging.set_verbosity(optuna.logging.WARNING)

 from app.core.config import settings
 from app.core import constants as const
-from app.utils.timezone_utils import prepare_prophet_datetime
+from app.utils.ml_datetime import prepare_prophet_datetime
 from app.utils.file_utils import ChecksummedFile, calculate_file_checksum
 from app.utils.distributed_lock import get_training_lock, LockAcquisitionError

--- a/services/training/app/services/date_alignment_service.py
+++ b/services/training/app/services/date_alignment_service.py
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass
 from enum import Enum
 import logging
-from app.utils.timezone_utils import ensure_timezone_aware
+from app.utils.ml_datetime import ensure_timezone_aware

 logger = logging.getLogger(__name__)

--- a/services/training/app/utils/__init__.py
+++ b/services/training/app/utils/__init__.py
@@ -2,7 +2,7 @@
 Training Service Utilities
 """

-from .timezone_utils import (
+from .ml_datetime import (
    ensure_timezone_aware,
    ensure_timezone_naive,
    normalize_datetime_to_utc,
--- a/services/training/app/utils/timezone_utils.py
+++ b/services/training/app/utils/timezone_utils.py
@@ -1,10 +1,14 @@
 """
-Timezone Utility Functions
-Centralized timezone handling to ensure consistency across the training service
+ML-Specific DateTime Utilities
+
+DateTime utilities for machine learning operations, specifically for:
+- Prophet forecasting model (requires timezone-naive datetimes)
+- Pandas DataFrame datetime operations
+- Time series data processing
 """

 from datetime import datetime, timezone
-from typing import Optional, Union
+from typing import Union
 import pandas as pd
 import logging

@@ -61,15 +65,12 @@ def normalize_datetime_to_utc(dt: Union[datetime, pd.Timestamp]) -> datetime:
    if dt is None:
        return None

-    # Handle pandas Timestamp
    if isinstance(dt, pd.Timestamp):
        dt = dt.to_pydatetime()

-    # If naive, assume UTC
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)

-    # If aware but not UTC, convert to UTC
    return dt.astimezone(timezone.utc)


@@ -93,19 +94,15 @@ def normalize_dataframe_datetime_column(
        logger.warning(f"Column {column} not found in dataframe")
        return df

-    # Convert to datetime if not already
    df[column] = pd.to_datetime(df[column])

    if target_format == 'naive':
-        # Remove timezone if present
        if df[column].dt.tz is not None:
            df[column] = df[column].dt.tz_localize(None)
    elif target_format == 'aware':
-        # Add UTC timezone if not present
        if df[column].dt.tz is None:
            df[column] = df[column].dt.tz_localize(timezone.utc)
        else:
-            # Convert to UTC if different timezone
            df[column] = df[column].dt.tz_convert(timezone.utc)
    else:
        raise ValueError(f"Invalid target_format: {target_format}. Must be 'naive' or 'aware'")
@@ -140,7 +137,6 @@ def safe_datetime_comparison(dt1: datetime, dt2: datetime) -> int:
    Returns:
        -1 if dt1 < dt2, 0 if equal, 1 if dt1 > dt2
    """
-    # Normalize both to UTC for comparison
    dt1_utc = normalize_datetime_to_utc(dt1)
    dt2_utc = normalize_datetime_to_utc(dt2)

@@ -176,9 +172,99 @@ def convert_timestamp_to_datetime(timestamp: Union[int, float, str]) -> datetime
        dt = pd.to_datetime(timestamp)
        return normalize_datetime_to_utc(dt)

-    # Check if milliseconds (typical JavaScript timestamp)
    if timestamp > 1e10:
        timestamp = timestamp / 1000

    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return dt
+
+
+def align_dataframe_dates(
+    dfs: list[pd.DataFrame],
+    date_column: str = 'ds',
+    method: str = 'inner'
+) -> list[pd.DataFrame]:
+    """
+    Align multiple dataframes to have the same date range.
+
+    Rows are matched on calendar date (the time of day is ignored). Frames
+    that do not contain ``date_column`` do not take part in the alignment
+    and are passed through untouched.
+
+    Args:
+        dfs: List of DataFrames to align
+        date_column: Name of the date column
+        method: 'inner' (intersection) or 'outer' (union)
+
+    Returns:
+        List of aligned DataFrames, in the same order as ``dfs``. Aligned
+        frames are copies whose ``date_column`` has been converted with
+        ``pd.to_datetime``. An empty or single-element input is returned
+        without any processing.
+
+    Raises:
+        ValueError: If ``method`` is neither 'inner' nor 'outer'.
+    """
+    # Reject unknown methods instead of silently keeping only the first
+    # frame's dates.
+    if method not in ('inner', 'outer'):
+        raise ValueError(f"Invalid method: {method}. Must be 'inner' or 'outer'")
+
+    if not dfs:
+        return []
+
+    if len(dfs) == 1:
+        return dfs
+
+    # Parse every date column exactly once; None marks a frame without it.
+    parsed_columns = [
+        pd.to_datetime(df[date_column]) if date_column in df.columns else None
+        for df in dfs
+    ]
+
+    date_sets = [
+        set(parsed.dt.date) for parsed in parsed_columns if parsed is not None
+    ]
+    combine = set.intersection if method == 'inner' else set.union
+    shared_dates = combine(*date_sets) if date_sets else set()
+
+    aligned_dfs = []
+    for df, parsed in zip(dfs, parsed_columns):
+        if parsed is None:
+            aligned_dfs.append(df)
+            continue
+
+        # Filter with a boolean mask rather than a scratch column, so a
+        # caller column that happens to share the scratch name is preserved.
+        keep = parsed.dt.date.isin(shared_dates)
+        aligned = df[keep].copy()
+        aligned[date_column] = parsed[keep]
+        aligned_dfs.append(aligned)
+
+    return aligned_dfs
+
+
+def fill_missing_dates(
+    df: pd.DataFrame,
+    date_column: str = 'ds',
+    freq: str = 'D',
+    fill_value: float = 0.0
+) -> pd.DataFrame:
+    """
+    Fill missing dates in a DataFrame with a specified frequency.
+
+    The frame is reindexed onto a regular range running from the smallest
+    to the largest value of ``date_column``. Rows whose timestamp does not
+    fall on that range are dropped by the reindex. The input frame is not
+    modified.
+
+    Args:
+        df: DataFrame with date column
+        date_column: Name of the date column
+        freq: Pandas offset alias (e.g. 'D' for daily; hourly is 'h',
+            spelled 'H' on pandas older than 2.2)
+        fill_value: Value written into every column of the inserted rows
+
+    Returns:
+        DataFrame with filled dates and ``date_column`` as its first column.
+        If the date column is empty or holds only missing values there is no
+        range to fill, and a converted copy is returned as-is.
+
+    Raises:
+        ValueError: If ``date_column`` contains duplicate timestamps
+            (pandas cannot reindex an axis with duplicate labels).
+    """
+    df = df.copy()
+    df[date_column] = pd.to_datetime(df[date_column])
+
+    # min()/max() of an empty or all-NaT column are NaT, which
+    # pd.date_range rejects; there is nothing to fill in that case.
+    if df[date_column].isna().all():
+        return df
+
+    df = df.set_index(date_column)
+
+    full_range = pd.date_range(
+        start=df.index.min(),
+        end=df.index.max(),
+        freq=freq
+    )
+
+    df = df.reindex(full_range, fill_value=fill_value)
+
+    # Name the index explicitly before resetting it. Relying on the default
+    # 'index' column name breaks when the frame already has an 'index'
+    # column (reset_index then emits 'level_0' instead).
+    df = df.rename_axis(date_column).reset_index()
+
+    return df