2025-10-09 14:11:02 +02:00
|
|
|
"""
|
2025-10-12 23:16:04 +02:00
|
|
|
ML-Specific DateTime Utilities

DateTime utilities for machine learning operations, specifically for:
- Prophet forecasting model (requires timezone-naive datetimes)
- Pandas DataFrame datetime operations
- Time series data processing
|
2025-10-09 14:11:02 +02:00
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from datetime import datetime, timezone
|
2025-10-12 23:16:04 +02:00
|
|
|
from typing import Union
|
2025-10-09 14:11:02 +02:00
|
|
|
import pandas as pd
|
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_timezone_aware(dt: datetime, default_tz=timezone.utc) -> datetime:
    """
    Return ``dt`` with a timezone attached.

    A naive datetime is stamped with ``default_tz`` without shifting its
    wall-clock fields. An already-aware datetime is returned unchanged,
    and ``None`` is passed straight through.

    Args:
        dt: Datetime to check (may be None)
        default_tz: Timezone to attach when ``dt`` is naive (default: UTC)

    Returns:
        Timezone-aware datetime, or None when ``dt`` is None
    """
    # Nothing to do for a missing value or one that already carries a tz.
    if dt is None or dt.tzinfo is not None:
        return dt
    return dt.replace(tzinfo=default_tz)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_timezone_naive(dt: datetime) -> datetime:
    """
    Return ``dt`` with its timezone information stripped.

    The wall-clock fields are kept exactly as they are; no conversion to
    UTC (or any other zone) happens before ``tzinfo`` is dropped. A naive
    datetime is returned unchanged, and ``None`` is passed straight through.

    Args:
        dt: Datetime to process (may be None)

    Returns:
        Timezone-naive datetime, or None when ``dt`` is None
    """
    # Nothing to strip from a missing value or an already-naive datetime.
    if dt is None or dt.tzinfo is None:
        return dt
    return dt.replace(tzinfo=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_datetime_to_utc(dt: Union[datetime, pd.Timestamp]) -> datetime:
    """
    Convert a datetime or pandas Timestamp to a UTC-aware ``datetime``.

    Naive inputs are treated as already being in UTC and are simply tagged
    with ``timezone.utc``; aware inputs are converted to UTC.

    Args:
        dt: Datetime or pandas Timestamp to normalize (may be None)

    Returns:
        UTC timezone-aware datetime, or None when ``dt`` is None
    """
    if dt is None:
        return None

    # Unwrap pandas Timestamps into plain datetime objects first.
    value = dt.to_pydatetime() if isinstance(dt, pd.Timestamp) else dt

    if value.tzinfo is not None:
        return value.astimezone(timezone.utc)
    return value.replace(tzinfo=timezone.utc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_dataframe_datetime_column(
    df: pd.DataFrame,
    column: str,
    target_format: str = 'naive'
) -> pd.DataFrame:
    """
    Normalize a datetime column in a dataframe to a consistent format.

    The column is parsed with ``pd.to_datetime`` and then either stripped
    of its timezone ('naive', wall-clock values are kept as-is) or made
    UTC-aware ('aware': naive values are tagged as UTC, aware values are
    converted to UTC).

    Note: the column is written back into ``df`` itself; pass a copy if
    the caller's frame must not be modified.

    Args:
        df: DataFrame to process (modified in place)
        column: Name of datetime column
        target_format: 'naive' or 'aware' (UTC)

    Returns:
        The same DataFrame with the normalized datetime column. If
        ``column`` is missing, a warning is logged and ``df`` is returned
        untouched.

    Raises:
        ValueError: If ``target_format`` is neither 'naive' nor 'aware'.
            The check happens before any conversion, so ``df`` is left
            unmodified in that case.
    """
    if column not in df.columns:
        logger.warning("Column %s not found in dataframe", column)
        return df

    # Validate before touching the frame so that a bad argument cannot
    # leave the caller's column half-converted.
    if target_format not in ('naive', 'aware'):
        raise ValueError(f"Invalid target_format: {target_format}. Must be 'naive' or 'aware'")

    values = pd.to_datetime(df[column])

    if target_format == 'naive':
        if values.dt.tz is not None:
            values = values.dt.tz_localize(None)
    elif values.dt.tz is None:
        values = values.dt.tz_localize(timezone.utc)
    else:
        values = values.dt.tz_convert(timezone.utc)

    # Single write-back once the conversion has fully succeeded.
    df[column] = values
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_prophet_datetime(df: pd.DataFrame, datetime_col: str = 'ds') -> pd.DataFrame:
    """
    Return a copy of ``df`` whose datetime column is timezone-naive,
    as required by Prophet.

    The input frame is copied first, so the caller's DataFrame is not
    modified.

    Args:
        df: DataFrame with datetime column
        datetime_col: Name of datetime column (default: 'ds')

    Returns:
        New DataFrame with a Prophet-compatible (timezone-naive) datetime column
    """
    # Normalise on a private copy to keep the caller's frame intact.
    return normalize_dataframe_datetime_column(
        df.copy(), datetime_col, target_format='naive'
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def safe_datetime_comparison(dt1: datetime, dt2: datetime) -> int:
    """
    Three-way compare two datetimes regardless of timezone awareness.

    Both values are first normalized to UTC-aware datetimes via
    ``normalize_datetime_to_utc`` so that naive and aware inputs can be
    compared with each other.

    Args:
        dt1: First datetime
        dt2: Second datetime

    Returns:
        -1 if dt1 < dt2, 0 if equal, 1 if dt1 > dt2
    """
    left = normalize_datetime_to_utc(dt1)
    right = normalize_datetime_to_utc(dt2)

    # bool - bool yields the sign: (True - False) = 1, (False - True) = -1.
    return (left > right) - (left < right)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_current_utc() -> datetime:
    """
    Return the current moment as a timezone-aware UTC datetime.

    Returns:
        Current UTC datetime (``tzinfo`` is ``timezone.utc``)
    """
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_timestamp_to_datetime(timestamp: Union[int, float, str]) -> datetime:
    """
    Convert various timestamp formats to a UTC-aware datetime.

    Strings are parsed with ``pd.to_datetime`` and normalized to UTC.
    Numbers are interpreted as Unix timestamps; a value whose magnitude
    exceeds 1e10 is assumed to be in milliseconds and is scaled down to
    seconds first. The magnitude (not the signed value) is checked so
    that pre-1970 millisecond timestamps are recognised too.

    Args:
        timestamp: Unix timestamp (seconds or milliseconds) or ISO string

    Returns:
        UTC timezone-aware datetime
    """
    if isinstance(timestamp, str):
        return normalize_datetime_to_utc(pd.to_datetime(timestamp))

    # Heuristic: |t| > 1e10 seconds would be beyond year 2286 (or before
    # 1653), so such values are treated as milliseconds.
    if abs(timestamp) > 1e10:
        timestamp = timestamp / 1000

    return datetime.fromtimestamp(timestamp, tz=timezone.utc)
|
2025-10-12 23:16:04 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def align_dataframe_dates(
    dfs: list[pd.DataFrame],
    date_column: str = 'ds',
    method: str = 'inner'
) -> list[pd.DataFrame]:
    """
    Align multiple dataframes to have the same set of calendar dates.

    Dates are compared at day granularity (the time-of-day part is
    ignored). Frames that do not contain ``date_column`` take no part in
    the alignment and are returned unchanged. Frames that do contain it
    are returned as filtered copies whose ``date_column`` has been parsed
    with ``pd.to_datetime``; the input frames are not modified.

    Args:
        dfs: List of DataFrames to align
        date_column: Name of the date column
        method: 'inner' (intersection) or 'outer' (union)

    Returns:
        List of aligned DataFrames, in the same order as ``dfs``. An empty
        input yields an empty list and a single-element input is returned
        as-is.

    Raises:
        ValueError: If ``method`` is neither 'inner' nor 'outer' (only
            checked when there are at least two frames to align).
    """
    if not dfs:
        return []

    if len(dfs) == 1:
        return dfs

    # Reject unknown methods instead of silently aligning to the first frame.
    if method not in ('inner', 'outer'):
        raise ValueError(f"Invalid method: {method}. Must be 'inner' or 'outer'")

    # Collect the set of calendar dates present in each participating frame.
    date_sets = [
        set(pd.to_datetime(df[date_column]).dt.date)
        for df in dfs
        if date_column in df.columns
    ]

    if date_sets:
        combine = set.intersection if method == 'inner' else set.union
        all_dates = combine(*date_sets)
    else:
        all_dates = set()

    aligned_dfs = []
    for df in dfs:
        if date_column not in df.columns:
            aligned_dfs.append(df)
            continue

        aligned = df.copy()
        aligned[date_column] = pd.to_datetime(aligned[date_column])
        # Filter with a boolean mask rather than a temporary column so that
        # no caller-owned column can be overwritten or dropped.
        keep = aligned[date_column].dt.date.isin(all_dates)
        aligned_dfs.append(aligned[keep])

    return aligned_dfs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fill_missing_dates(
    df: pd.DataFrame,
    date_column: str = 'ds',
    freq: str = 'D',
    fill_value: float = 0.0
) -> pd.DataFrame:
    """
    Fill missing dates in a DataFrame with a specified frequency.

    The date column is parsed with ``pd.to_datetime``, a complete range
    from the earliest to the latest date is generated at ``freq``, and
    the frame is reindexed onto that range. Rows created for missing
    dates receive ``fill_value`` in every non-date column. The input
    frame is not modified.

    Args:
        df: DataFrame with date column
        date_column: Name of the date column
        freq: Pandas frequency string (e.g. 'D' for daily)
        fill_value: Value to fill for missing dates

    Returns:
        New DataFrame with one row per step of the date range and
        ``date_column`` as its first column. An empty input is returned
        as an empty copy.
    """
    df = df.copy()
    df[date_column] = pd.to_datetime(df[date_column])

    # With no rows there is no min/max to span; date_range would fail on NaT.
    if df.empty:
        return df

    indexed = df.set_index(date_column)

    full_range = pd.date_range(
        start=indexed.index.min(),
        end=indexed.index.max(),
        freq=freq
    )

    filled = indexed.reindex(full_range, fill_value=fill_value)

    # Name the index explicitly so reset_index emits the date column under
    # its real name; renaming a generic 'index' column afterwards would hit
    # the wrong column when the frame already has one called 'index'.
    return filled.rename_axis(date_column).reset_index()
|