REFACTOR production scheduler

This commit is contained in:
Urtzi Alfaro
2025-10-09 18:01:24 +02:00
parent 3c689b4f98
commit b420af32c5
13 changed files with 4046 additions and 6 deletions

View File

@@ -0,0 +1,258 @@
# shared/monitoring/scheduler_metrics.py
"""
Scheduler Metrics - Prometheus metrics for production and procurement schedulers
Provides comprehensive metrics for monitoring automated daily planning:
- Scheduler execution success/failure rates
- Tenant processing times
- Cache hit rates for forecasts
- Plan generation statistics
"""
from prometheus_client import Counter, Histogram, Gauge, Info
import structlog
logger = structlog.get_logger()
# ================================================================
# PRODUCTION SCHEDULER METRICS
# ================================================================
# Schedules generated, counted per tenant and per outcome.
production_schedules_generated_total = Counter(
    name='production_schedules_generated_total',
    documentation='Total number of production schedules generated',
    labelnames=['tenant_id', 'status'],  # status: success, failure
)

# Wall-clock time of one schedule generation, observed per tenant.
production_schedule_generation_duration_seconds = Histogram(
    name='production_schedule_generation_duration_seconds',
    documentation='Time taken to generate production schedule per tenant',
    labelnames=['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300],  # seconds
)

# Tenants handled by the production scheduler, by processing outcome.
production_tenants_processed_total = Counter(
    name='production_tenants_processed_total',
    documentation='Total number of tenants processed by production scheduler',
    labelnames=['status'],  # status: success, failure, timeout
)

# Production batches created, counted per tenant.
production_batches_created_total = Counter(
    name='production_batches_created_total',
    documentation='Total number of production batches created',
    labelnames=['tenant_id'],
)

# Scheduler executions, by what triggered them.
production_scheduler_runs_total = Counter(
    name='production_scheduler_runs_total',
    documentation='Total number of production scheduler executions',
    labelnames=['trigger'],  # trigger: scheduled, manual, test
)

# Scheduler errors, by error type.
production_scheduler_errors_total = Counter(
    name='production_scheduler_errors_total',
    documentation='Total number of production scheduler errors',
    labelnames=['error_type'],
)
# ================================================================
# PROCUREMENT SCHEDULER METRICS
# ================================================================
# Procurement plans generated, counted per tenant and per outcome.
procurement_plans_generated_total = Counter(
    name='procurement_plans_generated_total',
    documentation='Total number of procurement plans generated',
    labelnames=['tenant_id', 'status'],  # status: success, failure
)

# Wall-clock time of one plan generation, observed per tenant.
procurement_plan_generation_duration_seconds = Histogram(
    name='procurement_plan_generation_duration_seconds',
    documentation='Time taken to generate procurement plan per tenant',
    labelnames=['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300],
)

# Tenants handled by the procurement scheduler, by processing outcome.
procurement_tenants_processed_total = Counter(
    name='procurement_tenants_processed_total',
    documentation='Total number of tenants processed by procurement scheduler',
    labelnames=['status'],  # status: success, failure, timeout
)

# Procurement requirements created, per tenant and priority.
procurement_requirements_created_total = Counter(
    name='procurement_requirements_created_total',
    documentation='Total number of procurement requirements created',
    labelnames=['tenant_id', 'priority'],  # priority: critical, high, medium, low
)

# Scheduler executions, by what triggered them.
procurement_scheduler_runs_total = Counter(
    name='procurement_scheduler_runs_total',
    documentation='Total number of procurement scheduler executions',
    labelnames=['trigger'],  # trigger: scheduled, manual, test
)

# Rejected plans, per tenant and whether a regeneration followed automatically.
procurement_plan_rejections_total = Counter(
    name='procurement_plan_rejections_total',
    documentation='Total number of procurement plans rejected',
    labelnames=['tenant_id', 'auto_regenerated'],  # auto_regenerated: true, false
)

# Current number of plans in each status, per tenant (gauge, so it can go down).
procurement_plans_by_status = Gauge(
    name='procurement_plans_by_status',
    documentation='Number of procurement plans by status',
    labelnames=['tenant_id', 'status'],
)
# ================================================================
# FORECAST CACHING METRICS
# ================================================================
# Cache lookups that found an entry, per tenant.
forecast_cache_hits_total = Counter(
    name='forecast_cache_hits_total',
    documentation='Total number of forecast cache hits',
    labelnames=['tenant_id'],
)

# Cache lookups that found nothing, per tenant.
forecast_cache_misses_total = Counter(
    name='forecast_cache_misses_total',
    documentation='Total number of forecast cache misses',
    labelnames=['tenant_id'],
)

# Hit rate as a percentage; the value is supplied by the caller, not derived here.
forecast_cache_hit_rate = Gauge(
    name='forecast_cache_hit_rate',
    documentation='Forecast cache hit rate percentage (0-100)',
    labelnames=['tenant_id'],
)

# Current number of cached entries, per cache type.
forecast_cache_entries_total = Gauge(
    name='forecast_cache_entries_total',
    documentation='Total number of entries in forecast cache',
    labelnames=['cache_type'],  # cache_type: single, batch
)

# Cache invalidations, per tenant and reason.
forecast_cache_invalidations_total = Counter(
    name='forecast_cache_invalidations_total',
    documentation='Total number of forecast cache invalidations',
    labelnames=['tenant_id', 'reason'],  # reason: model_retrain, manual, expiry
)
# ================================================================
# GENERAL SCHEDULER HEALTH METRICS
# ================================================================
# Binary health flag per (service, scheduler_type) pair.
scheduler_health_status = Gauge(
    name='scheduler_health_status',
    documentation='Scheduler health status (1=healthy, 0=unhealthy)',
    # service: production, orders; scheduler_type: daily, weekly, cleanup
    labelnames=['service', 'scheduler_type'],
)

# When the scheduler last ran, as a Unix timestamp.
scheduler_last_run_timestamp = Gauge(
    name='scheduler_last_run_timestamp',
    documentation='Unix timestamp of last scheduler run',
    labelnames=['service', 'scheduler_type'],
)

# When the scheduler is next due to run, as a Unix timestamp.
scheduler_next_run_timestamp = Gauge(
    name='scheduler_next_run_timestamp',
    documentation='Unix timestamp of next scheduled run',
    labelnames=['service', 'scheduler_type'],
)

# Tenant processing timeouts, per service and tenant.
tenant_processing_timeout_total = Counter(
    name='tenant_processing_timeout_total',
    documentation='Total number of tenant processing timeouts',
    labelnames=['service', 'tenant_id'],  # service: production, procurement
)
# ================================================================
# HELPER FUNCTIONS FOR METRICS
# ================================================================
class SchedulerMetricsCollector:
    """Stateless facade over the module-level scheduler metrics.

    Every method is a ``staticmethod`` that translates plain arguments into
    the label values of one or more metrics declared in this module.
    """

    @staticmethod
    def record_production_schedule_generated(tenant_id: str, success: bool, duration_seconds: float, batches_created: int):
        """Record the outcome of one production schedule generation.

        Args:
            tenant_id: Tenant the schedule was generated for.
            success: Whether generation succeeded.
            duration_seconds: How long generation took.
            batches_created: Number of batches produced; only counted when
                ``success`` is true.
        """
        if success:
            outcome = 'success'
        else:
            outcome = 'failure'

        production_schedules_generated_total.labels(tenant_id=tenant_id, status=outcome).inc()
        # The duration is observed for failures as well as successes.
        production_schedule_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if not success:
            return
        production_batches_created_total.labels(tenant_id=tenant_id).inc(batches_created)

    @staticmethod
    def record_procurement_plan_generated(tenant_id: str, success: bool, duration_seconds: float, requirements_count: int):
        """Record the outcome of one procurement plan generation.

        Args:
            tenant_id: Tenant the plan was generated for.
            success: Whether generation succeeded.
            duration_seconds: How long generation took.
            requirements_count: Number of requirements created; only counted
                when ``success`` is true.
        """
        if success:
            outcome = 'success'
        else:
            outcome = 'failure'

        procurement_plans_generated_total.labels(tenant_id=tenant_id, status=outcome).inc()
        procurement_plan_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if not success:
            return
        # Every requirement is attributed to 'medium': this is a default and
        # should be updated with the actual priority.
        requirements_counter = procurement_requirements_created_total.labels(
            tenant_id=tenant_id,
            priority='medium',
        )
        requirements_counter.inc(requirements_count)

    @staticmethod
    def record_scheduler_run(service: str, trigger: str = 'scheduled'):
        """Count one scheduler execution for ``service``.

        Only 'production' and 'procurement' are recorded; any other service
        value is ignored without error.
        """
        if service == 'production':
            runs_counter = production_scheduler_runs_total
        elif service == 'procurement':
            runs_counter = procurement_scheduler_runs_total
        else:
            return
        runs_counter.labels(trigger=trigger).inc()

    @staticmethod
    def record_tenant_processing(service: str, status: str):
        """Count one processed tenant for ``service`` with the given status.

        Only 'production' and 'procurement' are recorded; any other service
        value is ignored without error.
        """
        if service == 'production':
            processed_counter = production_tenants_processed_total
        elif service == 'procurement':
            processed_counter = procurement_tenants_processed_total
        else:
            return
        processed_counter.labels(status=status).inc()

    @staticmethod
    def record_forecast_cache_lookup(tenant_id: str, hit: bool):
        """Count one forecast cache lookup as either a hit or a miss."""
        lookup_counter = forecast_cache_hits_total if hit else forecast_cache_misses_total
        lookup_counter.labels(tenant_id=tenant_id).inc()

    @staticmethod
    def update_forecast_cache_hit_rate(tenant_id: str, hit_rate_percent: float):
        """Set the forecast cache hit-rate gauge for a tenant."""
        tenant_gauge = forecast_cache_hit_rate.labels(tenant_id=tenant_id)
        tenant_gauge.set(hit_rate_percent)

    @staticmethod
    def record_plan_rejection(tenant_id: str, auto_regenerated: bool):
        """Count one rejected procurement plan.

        ``auto_regenerated`` is exported as the label value 'true' or 'false'.
        """
        regenerated_label = 'true' if auto_regenerated else 'false'
        procurement_plan_rejections_total.labels(
            tenant_id=tenant_id,
            auto_regenerated=regenerated_label,
        ).inc()

    @staticmethod
    def update_scheduler_health(service: str, scheduler_type: str, is_healthy: bool):
        """Set the health gauge to 1 (healthy) or 0 (unhealthy)."""
        health_value = 1 if is_healthy else 0
        scheduler_health_status.labels(
            service=service,
            scheduler_type=scheduler_type,
        ).set(health_value)

    @staticmethod
    def record_timeout(service: str, tenant_id: str):
        """Count one tenant processing timeout for ``service``."""
        timeout_counter = tenant_processing_timeout_total.labels(
            service=service,
            tenant_id=tenant_id,
        )
        timeout_counter.inc()
# Module-level collector instance, created once at import time and handed out
# by get_scheduler_metrics_collector().
metrics_collector = SchedulerMetricsCollector()


def get_scheduler_metrics_collector() -> SchedulerMetricsCollector:
    """Return the module-level ``SchedulerMetricsCollector`` instance."""
    return metrics_collector

View File

@@ -0,0 +1,276 @@
# shared/utils/timezone_helper.py
"""
Timezone Utility Helper for Bakery Management System
Provides timezone-aware date/time utilities for accurate scheduling across
different geographic locations. All schedulers should use these utilities
to ensure consistent behavior.
"""
from datetime import datetime, date, time
from typing import Optional
from zoneinfo import ZoneInfo
import structlog
logger = structlog.get_logger()
class TimezoneHelper:
    """Timezone-aware date/time helpers for the schedulers.

    Methods take an IANA timezone name.  Unless a method documents otherwise,
    a name that ``ZoneInfo`` cannot resolve is logged as a warning and
    replaced by ``DEFAULT_TIMEZONE`` rather than raising.
    """

    DEFAULT_TIMEZONE = "Europe/Madrid"
    # Not referenced inside this class: validate_timezone() asks ZoneInfo
    # directly instead of consulting this set.
    VALID_TIMEZONES = {
        "Europe/Madrid", "Europe/London", "Europe/Paris", "Europe/Berlin",
        "America/New_York", "America/Chicago", "America/Los_Angeles",
        "Asia/Tokyo", "Asia/Shanghai", "Australia/Sydney",
        "UTC"
    }

    @classmethod
    def _resolve_zone(cls, timezone_str: str) -> ZoneInfo:
        """Return the ``ZoneInfo`` for ``timezone_str``, or the default zone.

        Any failure to build the zone is logged as a warning and answered with
        ``DEFAULT_TIMEZONE`` so that scheduling keeps working.
        """
        try:
            return ZoneInfo(timezone_str)
        except Exception as e:
            logger.warning(f"Invalid timezone {timezone_str}, using default",
                           error=str(e))
            return ZoneInfo(cls.DEFAULT_TIMEZONE)

    @classmethod
    def get_current_date_in_timezone(cls, timezone_str: str) -> date:
        """
        Get current date in specified timezone

        Args:
            timezone_str: IANA timezone string (e.g., "Europe/Madrid")

        Returns:
            Current date in the specified timezone (default timezone if the
            name is invalid)
        """
        return datetime.now(cls._resolve_zone(timezone_str)).date()

    @classmethod
    def get_current_datetime_in_timezone(cls, timezone_str: str) -> datetime:
        """
        Get current datetime in specified timezone

        Args:
            timezone_str: IANA timezone string

        Returns:
            Current timezone-aware datetime in the specified timezone
            (default timezone if the name is invalid)
        """
        return datetime.now(cls._resolve_zone(timezone_str))

    @classmethod
    def combine_date_time_in_timezone(
        cls,
        target_date: date,
        target_time: time,
        timezone_str: str
    ) -> datetime:
        """
        Combine date and time in specified timezone

        Args:
            target_date: Date component
            target_time: Time component
            timezone_str: IANA timezone string

        Returns:
            Datetime combining date and time, tagged with the specified
            timezone (default timezone if the name is invalid)
        """
        tz = cls._resolve_zone(timezone_str)
        return datetime.combine(target_date, target_time, tzinfo=tz)

    @classmethod
    def convert_to_utc(cls, dt: datetime) -> datetime:
        """
        Convert datetime to UTC

        Args:
            dt: Datetime to convert.  A naive datetime is assumed to already
                be in UTC; this is logged as a warning.

        Returns:
            Datetime in UTC timezone
        """
        utc = ZoneInfo("UTC")
        if dt.tzinfo is None:
            logger.warning("Converting naive datetime to UTC, assuming UTC")
            return dt.replace(tzinfo=utc)
        return dt.astimezone(utc)

    @classmethod
    def convert_from_utc(cls, dt: datetime, target_timezone: str) -> datetime:
        """
        Convert a datetime to the target timezone

        Args:
            dt: Datetime to convert.  A naive datetime is treated as UTC; an
                aware datetime is converted from whatever zone it carries.
            target_timezone: Target IANA timezone string

        Returns:
            Datetime in target timezone (default timezone if the name is
            invalid)
        """
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=ZoneInfo("UTC"))
        return dt.astimezone(cls._resolve_zone(target_timezone))

    @classmethod
    def validate_timezone(cls, timezone_str: str) -> bool:
        """
        Validate if timezone string is valid

        Args:
            timezone_str: IANA timezone string to validate

        Returns:
            True if ``ZoneInfo`` accepts the name, False otherwise
        """
        try:
            ZoneInfo(timezone_str)
        except Exception:
            return False
        return True

    @classmethod
    def get_timezone_offset_hours(cls, timezone_str: str) -> float:
        """
        Get current UTC offset for timezone in hours

        Args:
            timezone_str: IANA timezone string

        Returns:
            UTC offset in hours (e.g., +2.0 for CEST); 0.0, with a warning,
            when the offset cannot be determined
        """
        try:
            now = datetime.now(ZoneInfo(timezone_str))
            return now.utcoffset().total_seconds() / 3600
        except Exception as e:
            logger.warning(f"Could not get offset for {timezone_str}",
                           error=str(e))
            return 0.0

    @classmethod
    def is_business_hours(
        cls,
        dt: Optional[datetime] = None,
        timezone_str: str = DEFAULT_TIMEZONE,
        start_hour: int = 8,
        end_hour: int = 20
    ) -> bool:
        """
        Check if datetime is within business hours

        Args:
            dt: Datetime to check (defaults to now).  A naive datetime is
                assumed to already be expressed in ``timezone_str``; an aware
                one is converted to it.
            timezone_str: IANA timezone string
            start_hour: Business hours start, inclusive (24h format)
            end_hour: Business hours end, exclusive (24h format)

        Returns:
            True if within business hours, False otherwise.  Saturdays and
            Sundays are never business hours.
        """
        if dt is None:
            dt = cls.get_current_datetime_in_timezone(timezone_str)
        elif dt.tzinfo is None:
            # Resolve through the shared helper so an invalid timezone falls
            # back to the default here too, like the other two branches.
            dt = dt.replace(tzinfo=cls._resolve_zone(timezone_str))
        else:
            dt = cls.convert_from_utc(dt, timezone_str)

        # weekday(): Monday=0 ... Sunday=6
        if dt.weekday() >= 5:
            return False
        return start_hour <= dt.hour < end_hour

    @classmethod
    def get_next_business_day_at_time(
        cls,
        target_time: time,
        timezone_str: str = DEFAULT_TIMEZONE,
        from_datetime: Optional[datetime] = None
    ) -> datetime:
        """
        Get next business day at specific time in timezone

        Args:
            target_time: Time to schedule (e.g., time(6, 0) for 6 AM)
            timezone_str: IANA timezone string
            from_datetime: Starting datetime (defaults to now); a naive value
                is treated as UTC

        Returns:
            The first Monday-Friday occurrence of ``target_time`` in the
            specified timezone that is strictly after the starting datetime
        """
        # Local import: the module header only imports datetime, date and time.
        from datetime import timedelta

        if from_datetime is None:
            current = cls.get_current_datetime_in_timezone(timezone_str)
        else:
            current = cls.convert_from_utc(from_datetime, timezone_str)

        candidate_day = current.date()
        # Today only qualifies while target_time is still ahead of us;
        # otherwise today's occurrence is already in the past, so start from
        # tomorrow.
        if not current.time() < target_time:
            candidate_day += timedelta(days=1)

        # Skip Saturday (5) and Sunday (6).
        while candidate_day.weekday() >= 5:
            candidate_day += timedelta(days=1)

        return cls.combine_date_time_in_timezone(
            candidate_day, target_time, timezone_str
        )
# Convenience functions for common operations
def get_tenant_current_date(tenant_timezone: str = "Europe/Madrid") -> date:
    """Return today's date as seen in the tenant's timezone.

    Thin wrapper around ``TimezoneHelper.get_current_date_in_timezone``.
    """
    today_for_tenant = TimezoneHelper.get_current_date_in_timezone(tenant_timezone)
    return today_for_tenant
def get_tenant_current_datetime(tenant_timezone: str = "Europe/Madrid") -> datetime:
    """Return the current datetime as seen in the tenant's timezone.

    Thin wrapper around ``TimezoneHelper.get_current_datetime_in_timezone``.
    """
    now_for_tenant = TimezoneHelper.get_current_datetime_in_timezone(tenant_timezone)
    return now_for_tenant
def is_tenant_business_hours(tenant_timezone: str = "Europe/Madrid") -> bool:
    """Tell whether the present moment is within the tenant's business hours.

    Delegates to ``TimezoneHelper.is_business_hours``, passing only the
    timezone so that the helper's own defaults apply to everything else.
    """
    within_hours = TimezoneHelper.is_business_hours(timezone_str=tenant_timezone)
    return within_hours
def validate_timezone(timezone_str: str) -> bool:
    """Return whether ``timezone_str`` is accepted as a timezone name.

    Module-level shortcut for ``TimezoneHelper.validate_timezone``.
    """
    is_valid = TimezoneHelper.validate_timezone(timezone_str)
    return is_valid