Files
bakery-ia/services/training/app/ml/calendar_features.py
2025-11-02 20:24:44 +01:00

308 lines
10 KiB
Python

"""
Calendar-based Feature Engineering
Hyperlocal school calendar and event features for demand forecasting
"""
import pandas as pd
import structlog
from typing import Dict, List, Any, Optional
from datetime import datetime, date, time, timedelta
from shared.clients.external_client import ExternalServiceClient
# Module-level structlog logger used by the methods in this file
logger = structlog.get_logger()
class CalendarFeatureEngine:
    """
    Generates features based on school calendars
    for hyperlocal demand forecasting enhancement

    The tenant's assigned calendar is fetched through the external service
    client and cached per tenant. From its holiday periods and school hours
    the engine derives holiday flags, a school-hours flag, and a
    drop-off/pick-up intensity score for each row of a DataFrame.
    """
def __init__(self, external_client: ExternalServiceClient):
    """
    Set up the engine with a service client and an empty calendar cache
    Args:
        external_client: Client used to fetch a tenant's location context
    """
    # Calendars already fetched, kept so repeated lookups skip the API call
    self.calendar_cache = {}
    self.external_client = external_client
async def get_calendar_for_tenant(
    self,
    tenant_id: str,
    city_id: Optional[str] = "madrid"
) -> Optional[Dict[str, Any]]:
    """
    Look up the school calendar assigned to a tenant
    A calendar that was found is cached per tenant, so later calls for the
    same tenant are answered without contacting the external service.
    Args:
        tenant_id: Tenant whose calendar assignment is requested
        city_id: Accepted but not used by this method
    Returns:
        The calendar dict, or None when the tenant has no calendar
        assigned or the lookup raised an exception
    """
    try:
        key = f"tenant_{tenant_id}_calendar"

        # Fast path: this tenant was already resolved
        if key in self.calendar_cache:
            logger.debug("Using cached calendar", tenant_id=tenant_id)
            return self.calendar_cache[key]

        # Slow path: ask the external service for the tenant's location context
        location_context = await self.external_client.get_tenant_location_context(
            tenant_id
        )
        assigned = location_context.get("calendar") if location_context else None
        if not assigned:
            logger.info(
                "No calendar assigned to tenant, using default if available",
                tenant_id=tenant_id
            )
            return None

        # Only successful lookups are cached; misses are retried next time
        self.calendar_cache[key] = assigned
        logger.info(
            "Retrieved calendar for tenant",
            tenant_id=tenant_id,
            calendar_name=assigned.get("calendar_name")
        )
        return assigned
    except Exception as exc:
        # Best effort: a failed lookup is logged and reported as "no calendar"
        logger.error(
            "Error retrieving calendar for tenant",
            tenant_id=tenant_id,
            error=str(exc)
        )
        return None
def _is_date_in_holiday_period(
self,
check_date: date,
holiday_periods: List[Dict[str, Any]]
) -> tuple[bool, Optional[str]]:
"""
Check if a date falls within any holiday period
Returns:
(is_holiday, holiday_name)
"""
for period in holiday_periods:
start = datetime.strptime(period["start_date"], "%Y-%m-%d").date()
end = datetime.strptime(period["end_date"], "%Y-%m-%d").date()
if start <= check_date <= end:
return True, period["name"]
return False, None
def _is_school_hours_active(
self,
check_datetime: datetime,
school_hours: Dict[str, Any]
) -> bool:
"""
Check if datetime falls during school operating hours
Args:
check_datetime: DateTime to check
school_hours: School hours configuration dict
Returns:
True if during school hours, False otherwise
"""
# Only check weekdays
if check_datetime.weekday() >= 5: # Saturday=5, Sunday=6
return False
check_time = check_datetime.time()
# Morning session
morning_start = datetime.strptime(
school_hours["morning_start"], "%H:%M"
).time()
morning_end = datetime.strptime(
school_hours["morning_end"], "%H:%M"
).time()
if morning_start <= check_time <= morning_end:
return True
# Afternoon session (if applicable)
if school_hours.get("has_afternoon_session", False):
afternoon_start = datetime.strptime(
school_hours["afternoon_start"], "%H:%M"
).time()
afternoon_end = datetime.strptime(
school_hours["afternoon_end"], "%H:%M"
).time()
if afternoon_start <= check_time <= afternoon_end:
return True
return False
def _calculate_school_proximity_intensity(
self,
check_datetime: datetime,
school_hours: Dict[str, Any]
) -> float:
"""
Calculate intensity of school-related foot traffic
Peaks during drop-off and pick-up times
Returns:
Float between 0.0 (no impact) and 1.0 (peak impact)
"""
# Only weekdays
if check_datetime.weekday() >= 5:
return 0.0
check_time = check_datetime.time()
# Define peak windows (30 minutes before and after school start/end)
morning_start = datetime.strptime(
school_hours["morning_start"], "%H:%M"
).time()
morning_end = datetime.strptime(
school_hours["morning_end"], "%H:%M"
).time()
# Morning drop-off peak (30 min before to 15 min after start)
drop_off_start = (
datetime.combine(date.today(), morning_start) - timedelta(minutes=30)
).time()
drop_off_end = (
datetime.combine(date.today(), morning_start) + timedelta(minutes=15)
).time()
if drop_off_start <= check_time <= drop_off_end:
return 1.0 # Peak morning traffic
# Morning pick-up peak (15 min before to 30 min after end)
pickup_start = (
datetime.combine(date.today(), morning_end) - timedelta(minutes=15)
).time()
pickup_end = (
datetime.combine(date.today(), morning_end) + timedelta(minutes=30)
).time()
if pickup_start <= check_time <= pickup_end:
return 1.0 # Peak afternoon traffic
# During school hours (moderate impact)
if morning_start <= check_time <= morning_end:
return 0.3
# Afternoon session if applicable
if school_hours.get("has_afternoon_session", False):
afternoon_start = datetime.strptime(
school_hours["afternoon_start"], "%H:%M"
).time()
afternoon_end = datetime.strptime(
school_hours["afternoon_end"], "%H:%M"
).time()
if afternoon_start <= check_time <= afternoon_end:
return 0.3
return 0.0
async def add_calendar_features(
    self,
    df: pd.DataFrame,
    tenant_id: str,
    date_column: str = "date"
) -> pd.DataFrame:
    """
    Add calendar-based features to dataframe
    The columns are written onto the passed-in DataFrame, which is also
    the object returned.
    Features added:
    - is_school_holiday: Binary (1/0)
    - school_holiday_name: String (name of holiday or None)
    - school_hours_active: Binary (1/0) - if during school operating hours
    - school_proximity_intensity: Float (0.0-1.0) - peak during drop-off/pick-up
    The two school-hours features stay at 0 / 0.0 for a row when the
    calendar has no morning session configured or when the row's value sits
    exactly at midnight (treated as a date without a time component).
    Holiday features are still computed in both cases.
    Args:
        df: DataFrame with date/datetime column
        tenant_id: Tenant ID to get calendar assignment
        date_column: Name of date column
    Returns:
        DataFrame with added calendar features; all four features take
        their default values when no calendar is available or when
        processing raises an exception
    """
    def with_default_features(frame: pd.DataFrame) -> pd.DataFrame:
        """Fill the four feature columns with neutral values."""
        frame["is_school_holiday"] = 0
        frame["school_holiday_name"] = None
        frame["school_hours_active"] = 0
        frame["school_proximity_intensity"] = 0.0
        return frame

    try:
        logger.info(
            "Adding calendar-based features",
            tenant_id=tenant_id,
            rows=len(df)
        )
        # Get calendar for tenant
        calendar = await self.get_calendar_for_tenant(tenant_id)
        if not calendar:
            logger.warning(
                "No calendar available, using fallback features",
                tenant_id=tenant_id
            )
            return with_default_features(df)

        # "or" also covers keys that are present but set to None
        holiday_periods = calendar.get("holiday_periods") or []
        school_hours = calendar.get("school_hours") or {}
        # Without a morning session the school-hours helpers have nothing to
        # evaluate; skipping them keeps a missing configuration from raising
        # and discarding the holiday features along with everything else
        has_school_hours = bool(
            school_hours.get("morning_start") and school_hours.get("morning_end")
        )

        school_holidays = []
        holiday_names = []
        hours_active = []
        proximity_intensity = []

        # Only the date column is needed, so iterate it directly rather than
        # materialising every row with iterrows()
        for raw_value in df[date_column]:
            row_date = pd.to_datetime(raw_value)

            # Check if holiday
            is_holiday, holiday_name = self._is_date_in_holiday_period(
                row_date.date(),
                holiday_periods
            )
            school_holidays.append(1 if is_holiday else 0)
            holiday_names.append(holiday_name)

            # A Timestamp always exposes .hour, so a date-only value can only
            # be recognised by its time part being exactly midnight
            has_time_component = row_date != row_date.normalize()
            if has_school_hours and has_time_component:
                hours_active.append(
                    1 if self._is_school_hours_active(row_date, school_hours) else 0
                )
                proximity_intensity.append(
                    self._calculate_school_proximity_intensity(row_date, school_hours)
                )
            else:
                hours_active.append(0)
                proximity_intensity.append(0.0)

        # Add features to dataframe
        df["is_school_holiday"] = school_holidays
        df["school_holiday_name"] = holiday_names
        df["school_hours_active"] = hours_active
        df["school_proximity_intensity"] = proximity_intensity
        logger.info(
            "Calendar features added successfully",
            tenant_id=tenant_id,
            holiday_periods_count=len(holiday_periods),
            holidays_found=sum(school_holidays)
        )
        return df
    except Exception as e:
        logger.error(
            "Error adding calendar features",
            tenant_id=tenant_id,
            error=str(e)
        )
        # Return df with default features on error
        return with_default_features(df)