"""
ML-Specific DateTime Utilities

DateTime utilities for machine learning operations, specifically for:
- Prophet forecasting model (requires timezone-naive datetimes)
- Pandas DataFrame datetime operations
- Time series data processing
"""

from datetime import datetime, timezone
from typing import Optional, Union
import logging

import pandas as pd

logger = logging.getLogger(__name__)

# Numeric timestamps whose magnitude exceeds this are treated as milliseconds.
# 1e10 seconds is roughly the year 2286, while 1e10 milliseconds is April 1970.
_MILLISECOND_THRESHOLD = 1e10


def ensure_timezone_aware(
    dt: Optional[datetime], default_tz=timezone.utc
) -> Optional[datetime]:
    """
    Ensure a datetime is timezone-aware.

    Args:
        dt: Datetime to check (``None`` is passed through)
        default_tz: Timezone to attach if the datetime is naive (default: UTC)

    Returns:
        Timezone-aware datetime, or ``None`` when ``dt`` is ``None``.
        An already-aware datetime is returned unchanged (it is not converted
        to ``default_tz``).
    """
    if dt is None:
        return None
    if dt.tzinfo is None:
        return dt.replace(tzinfo=default_tz)
    return dt


def ensure_timezone_naive(dt: Optional[datetime]) -> Optional[datetime]:
    """
    Remove timezone information from a datetime.

    The timezone is stripped with ``replace(tzinfo=None)``, so the date and
    time fields are kept exactly as they are; no conversion to UTC happens.

    Args:
        dt: Datetime to process (``None`` is passed through)

    Returns:
        Timezone-naive datetime, or ``None`` when ``dt`` is ``None``
    """
    if dt is None:
        return None
    if dt.tzinfo is not None:
        return dt.replace(tzinfo=None)
    return dt


def normalize_datetime_to_utc(
    dt: Optional[Union[datetime, pd.Timestamp]]
) -> Optional[datetime]:
    """
    Normalize any datetime to a UTC timezone-aware datetime.

    Naive inputs are assumed to already be in UTC and are simply tagged;
    aware inputs are converted to UTC.

    Args:
        dt: Datetime or pandas Timestamp to normalize (``None`` is passed through)

    Returns:
        UTC timezone-aware datetime, or ``None`` when ``dt`` is ``None``
    """
    if dt is None:
        return None

    if isinstance(dt, pd.Timestamp):
        dt = dt.to_pydatetime()

    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)


def normalize_dataframe_datetime_column(
    df: pd.DataFrame,
    column: str,
    target_format: str = 'naive'
) -> pd.DataFrame:
    """
    Normalize a datetime column in a dataframe to a consistent format.

    Note: the column is reassigned on the frame that was passed in, and that
    same frame is returned. Pass a copy if the original must stay untouched.

    Args:
        df: DataFrame to process
        column: Name of datetime column
        target_format: 'naive' (timezone dropped, wall-clock values kept)
            or 'aware' (UTC)

    Returns:
        DataFrame with normalized datetime column. If ``column`` is missing,
        a warning is logged and ``df`` is returned unchanged.

    Raises:
        ValueError: If ``target_format`` is neither 'naive' nor 'aware'.
    """
    if column not in df.columns:
        logger.warning("Column %s not found in dataframe", column)
        return df

    # Validate before touching the frame so a bad argument cannot leave the
    # caller's data half-converted.
    if target_format not in ('naive', 'aware'):
        raise ValueError(
            f"Invalid target_format: {target_format}. Must be 'naive' or 'aware'"
        )

    df[column] = pd.to_datetime(df[column])
    current_tz = df[column].dt.tz

    if target_format == 'naive':
        if current_tz is not None:
            df[column] = df[column].dt.tz_localize(None)
    elif current_tz is None:
        df[column] = df[column].dt.tz_localize(timezone.utc)
    else:
        df[column] = df[column].dt.tz_convert(timezone.utc)

    return df


def prepare_prophet_datetime(df: pd.DataFrame, datetime_col: str = 'ds') -> pd.DataFrame:
    """
    Prepare datetime column for Prophet (requires timezone-naive datetimes).

    Works on a copy, so the input frame is not modified.

    Args:
        df: DataFrame with datetime column
        datetime_col: Name of datetime column (default: 'ds')

    Returns:
        DataFrame copy with a timezone-naive datetime column
    """
    prepared = df.copy()
    return normalize_dataframe_datetime_column(
        prepared, datetime_col, target_format='naive'
    )


def safe_datetime_comparison(dt1: datetime, dt2: datetime) -> int:
    """
    Safely compare two datetimes, handling timezone mismatches.

    Both values are normalized to UTC first (naive values are treated as UTC),
    so a naive and an aware datetime can be compared.

    Args:
        dt1: First datetime
        dt2: Second datetime

    Returns:
        -1 if dt1 < dt2, 0 if equal, 1 if dt1 > dt2
    """
    dt1_utc = normalize_datetime_to_utc(dt1)
    dt2_utc = normalize_datetime_to_utc(dt2)

    if dt1_utc < dt2_utc:
        return -1
    if dt1_utc > dt2_utc:
        return 1
    return 0


def get_current_utc() -> datetime:
    """
    Get current datetime in UTC with timezone awareness.

    Returns:
        Current UTC datetime
    """
    return datetime.now(timezone.utc)


def convert_timestamp_to_datetime(timestamp: Union[int, float, str]) -> datetime:
    """
    Convert various timestamp formats to datetime.

    Args:
        timestamp: Unix timestamp (seconds or milliseconds) or a date string
            parseable by ``pd.to_datetime``

    Returns:
        UTC timezone-aware datetime
    """
    if isinstance(timestamp, str):
        return normalize_datetime_to_utc(pd.to_datetime(timestamp))

    # Compare the magnitude so that negative (pre-1970) millisecond values
    # are scaled down as well, instead of being read as seconds.
    if abs(timestamp) > _MILLISECOND_THRESHOLD:
        timestamp = timestamp / 1000

    return datetime.fromtimestamp(timestamp, tz=timezone.utc)


def align_dataframe_dates(
    dfs: list[pd.DataFrame],
    date_column: str = 'ds',
    method: str = 'inner'
) -> list[pd.DataFrame]:
    """
    Align multiple dataframes to a common set of calendar dates.

    Rows are matched on the calendar date of ``date_column`` (time of day is
    ignored). Frames that lack ``date_column`` are skipped when collecting
    dates and are returned as-is.

    Note: this function only filters rows. With ``method='outer'`` the
    combined date set contains every frame's own dates, so no rows are
    added for dates a frame is missing.

    Args:
        dfs: List of DataFrames to align
        date_column: Name of the date column
        method: 'inner' (intersection) or 'outer' (union)

    Returns:
        List of aligned DataFrames, in the same order as ``dfs``. An empty
        input yields ``[]``; a single-element input is returned unchanged.

    Raises:
        ValueError: If ``method`` is neither 'inner' nor 'outer'.
    """
    if method not in ('inner', 'outer'):
        raise ValueError(f"Invalid method: {method}. Must be 'inner' or 'outer'")

    if not dfs:
        return []
    if len(dfs) == 1:
        return dfs

    # Parse each date column once; None marks frames without the column.
    parsed = [
        pd.to_datetime(df[date_column]) if date_column in df.columns else None
        for df in dfs
    ]

    combine = set.intersection if method == 'inner' else set.union
    all_dates = None
    for dates in parsed:
        if dates is None:
            continue
        day_set = set(dates.dt.date)
        all_dates = day_set if all_dates is None else combine(all_dates, day_set)

    aligned_dfs = []
    for df, dates in zip(dfs, parsed):
        if dates is None:
            aligned_dfs.append(df)
            continue

        # Filter with a mask rather than a scratch column so that no
        # caller-owned column can be overwritten or dropped.
        keep = dates.dt.date.isin(all_dates)
        aligned = df.loc[keep].copy()
        aligned[date_column] = dates.loc[keep]
        aligned_dfs.append(aligned)

    return aligned_dfs


def fill_missing_dates(
    df: pd.DataFrame,
    date_column: str = 'ds',
    freq: str = 'D',
    fill_value: float = 0.0
) -> pd.DataFrame:
    """
    Fill missing dates in a DataFrame with a specified frequency.

    The frame is reindexed onto a regular range running from the earliest to
    the latest value of ``date_column``. Rows created for missing dates get
    ``fill_value`` in every other column. The input frame is not modified.

    Args:
        df: DataFrame with date column
        date_column: Name of the date column
        freq: Pandas frequency string (e.g. 'D' for daily)
        fill_value: Value to fill for missing dates

    Returns:
        DataFrame with filled dates; ``date_column`` is the first column.
        If the frame has no valid dates (empty or all missing), a copy with
        the date column converted to datetime is returned.
    """
    work = df.copy()
    work[date_column] = pd.to_datetime(work[date_column])

    indexed = work.set_index(date_column)
    start = indexed.index.min()
    end = indexed.index.max()

    # No valid dates means there is no range to build.
    if pd.isna(start) or pd.isna(end):
        return work

    full_range = pd.date_range(start=start, end=end, freq=freq)
    filled = indexed.reindex(full_range, fill_value=fill_value)

    # Name the index axis explicitly so reset_index() creates the date column
    # directly. Renaming a generated 'index' column afterwards would hit the
    # wrong column when the data already has a column called 'index'.
    return filled.rename_axis(date_column).reset_index()