Files
bakery-ia/shared/monitoring/scheduler_metrics.py
2025-10-09 18:01:24 +02:00

259 lines
8.9 KiB
Python

# shared/monitoring/scheduler_metrics.py
"""
Scheduler Metrics - Prometheus metrics for production and procurement schedulers
Provides comprehensive metrics for monitoring automated daily planning:
- Scheduler execution success/failure rates
- Tenant processing times
- Cache hit rates for forecasts
- Plan generation statistics
"""
from prometheus_client import Counter, Histogram, Gauge, Info
import structlog
logger = structlog.get_logger()
# ================================================================
# PRODUCTION SCHEDULER METRICS
# ================================================================
# Schedules generated, split by tenant and outcome.
# status label values: success, failure
production_schedules_generated_total = Counter(
    name='production_schedules_generated_total',
    documentation='Total number of production schedules generated',
    labelnames=['tenant_id', 'status'],
)

# Per-tenant generation latency; bucket boundaries are in seconds.
production_schedule_generation_duration_seconds = Histogram(
    name='production_schedule_generation_duration_seconds',
    documentation='Time taken to generate production schedule per tenant',
    labelnames=['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300],
)

# Tenants handled by the production scheduler, split by outcome.
# status label values: success, failure, timeout
production_tenants_processed_total = Counter(
    name='production_tenants_processed_total',
    documentation='Total number of tenants processed by production scheduler',
    labelnames=['status'],
)

# Production batches created, per tenant.
production_batches_created_total = Counter(
    name='production_batches_created_total',
    documentation='Total number of production batches created',
    labelnames=['tenant_id'],
)

# Scheduler executions, split by what started them.
# trigger label values: scheduled, manual, test
production_scheduler_runs_total = Counter(
    name='production_scheduler_runs_total',
    documentation='Total number of production scheduler executions',
    labelnames=['trigger'],
)

# Scheduler errors, split by error type.
production_scheduler_errors_total = Counter(
    name='production_scheduler_errors_total',
    documentation='Total number of production scheduler errors',
    labelnames=['error_type'],
)
# ================================================================
# PROCUREMENT SCHEDULER METRICS
# ================================================================
# Procurement plans generated, split by tenant and outcome.
# status label values: success, failure
procurement_plans_generated_total = Counter(
    name='procurement_plans_generated_total',
    documentation='Total number of procurement plans generated',
    labelnames=['tenant_id', 'status'],
)

# Per-tenant plan generation latency; same bucket boundaries as the
# production schedule histogram.
procurement_plan_generation_duration_seconds = Histogram(
    name='procurement_plan_generation_duration_seconds',
    documentation='Time taken to generate procurement plan per tenant',
    labelnames=['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300],
)

# Tenants handled by the procurement scheduler, split by outcome.
# status label values: success, failure, timeout
procurement_tenants_processed_total = Counter(
    name='procurement_tenants_processed_total',
    documentation='Total number of tenants processed by procurement scheduler',
    labelnames=['status'],
)

# Procurement requirements created, split by tenant and priority.
# priority label values: critical, high, medium, low
procurement_requirements_created_total = Counter(
    name='procurement_requirements_created_total',
    documentation='Total number of procurement requirements created',
    labelnames=['tenant_id', 'priority'],
)

# Scheduler executions, split by what started them.
# trigger label values: scheduled, manual, test
procurement_scheduler_runs_total = Counter(
    name='procurement_scheduler_runs_total',
    documentation='Total number of procurement scheduler executions',
    labelnames=['trigger'],
)

# Rejected plans, split by tenant and whether a regeneration followed.
# auto_regenerated label values: true, false
procurement_plan_rejections_total = Counter(
    name='procurement_plan_rejections_total',
    documentation='Total number of procurement plans rejected',
    labelnames=['tenant_id', 'auto_regenerated'],
)

# Current count of procurement plans per tenant and plan status.
procurement_plans_by_status = Gauge(
    name='procurement_plans_by_status',
    documentation='Number of procurement plans by status',
    labelnames=['tenant_id', 'status'],
)
# ================================================================
# FORECAST CACHING METRICS
# ================================================================
# Forecast cache lookups that were served from the cache, per tenant.
forecast_cache_hits_total = Counter(
    name='forecast_cache_hits_total',
    documentation='Total number of forecast cache hits',
    labelnames=['tenant_id'],
)

# Forecast cache lookups that were not found in the cache, per tenant.
forecast_cache_misses_total = Counter(
    name='forecast_cache_misses_total',
    documentation='Total number of forecast cache misses',
    labelnames=['tenant_id'],
)

# Hit rate pushed by the caller, expressed as a percentage (0-100).
forecast_cache_hit_rate = Gauge(
    name='forecast_cache_hit_rate',
    documentation='Forecast cache hit rate percentage (0-100)',
    labelnames=['tenant_id'],
)

# Current number of cached entries.
# cache_type label values: single, batch
forecast_cache_entries_total = Gauge(
    name='forecast_cache_entries_total',
    documentation='Total number of entries in forecast cache',
    labelnames=['cache_type'],
)

# Cache invalidations, split by tenant and cause.
# reason label values: model_retrain, manual, expiry
forecast_cache_invalidations_total = Counter(
    name='forecast_cache_invalidations_total',
    documentation='Total number of forecast cache invalidations',
    labelnames=['tenant_id', 'reason'],
)
# ================================================================
# GENERAL SCHEDULER HEALTH METRICS
# ================================================================
# Health flag per scheduler: 1 means healthy, 0 means unhealthy.
# service label values: production, orders
# scheduler_type label values: daily, weekly, cleanup
scheduler_health_status = Gauge(
    name='scheduler_health_status',
    documentation='Scheduler health status (1=healthy, 0=unhealthy)',
    labelnames=['service', 'scheduler_type'],
)

# Unix timestamp of the most recent run of each scheduler.
scheduler_last_run_timestamp = Gauge(
    name='scheduler_last_run_timestamp',
    documentation='Unix timestamp of last scheduler run',
    labelnames=['service', 'scheduler_type'],
)

# Unix timestamp of the next planned run of each scheduler.
scheduler_next_run_timestamp = Gauge(
    name='scheduler_next_run_timestamp',
    documentation='Unix timestamp of next scheduled run',
    labelnames=['service', 'scheduler_type'],
)

# Tenant processing timeouts, split by service and tenant.
# service label values: production, procurement
tenant_processing_timeout_total = Counter(
    name='tenant_processing_timeout_total',
    documentation='Total number of tenant processing timeouts',
    labelnames=['service', 'tenant_id'],
)
# ================================================================
# HELPER FUNCTIONS FOR METRICS
# ================================================================
class SchedulerMetricsCollector:
    """Helper class for collecting scheduler metrics.

    Every method is a static wrapper around one or more of the
    module-level Prometheus metrics defined in this file, so callers do
    not have to know metric names or label layouts.
    """

    @staticmethod
    def record_production_schedule_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        batches_created: int,
    ) -> None:
        """Record the outcome of one production schedule generation.

        Args:
            tenant_id: Tenant the schedule was generated for.
            success: Whether generation succeeded.
            duration_seconds: Time the generation took, in seconds.
            batches_created: Number of batches created; only counted
                when ``success`` is true.
        """
        status = 'success' if success else 'failure'
        production_schedules_generated_total.labels(
            tenant_id=tenant_id, status=status
        ).inc()
        # Duration is observed for failures as well as successes.
        production_schedule_generation_duration_seconds.labels(
            tenant_id=tenant_id
        ).observe(duration_seconds)
        if success:
            production_batches_created_total.labels(
                tenant_id=tenant_id
            ).inc(batches_created)

    @staticmethod
    def record_procurement_plan_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        requirements_count: int,
        priority: str = 'medium',
    ) -> None:
        """Record the outcome of one procurement plan generation.

        Args:
            tenant_id: Tenant the plan was generated for.
            success: Whether generation succeeded.
            duration_seconds: Time the generation took, in seconds.
            requirements_count: Number of requirements created; only
                counted when ``success`` is true.
            priority: Value for the ``priority`` label of the
                requirements counter (critical, high, medium, low).
                Defaults to ``'medium'``, which is what this method
                always recorded before the parameter existed.
        """
        status = 'success' if success else 'failure'
        procurement_plans_generated_total.labels(
            tenant_id=tenant_id, status=status
        ).inc()
        # Duration is observed for failures as well as successes.
        procurement_plan_generation_duration_seconds.labels(
            tenant_id=tenant_id
        ).observe(duration_seconds)
        if success:
            procurement_requirements_created_total.labels(
                tenant_id=tenant_id,
                priority=priority,
            ).inc(requirements_count)

    @staticmethod
    def record_scheduler_run(service: str, trigger: str = 'scheduled') -> None:
        """Record one scheduler execution.

        Args:
            service: ``'production'`` or ``'procurement'``. Any other
                value records nothing and logs a warning.
            trigger: What started the run (scheduled, manual, test).
        """
        if service == 'production':
            production_scheduler_runs_total.labels(trigger=trigger).inc()
        elif service == 'procurement':
            procurement_scheduler_runs_total.labels(trigger=trigger).inc()
        else:
            # Previously dropped silently; surface it so a mistyped
            # service name does not quietly lose metrics.
            logger.warning(
                "Unknown service for scheduler run metric",
                service=service,
                trigger=trigger,
            )

    @staticmethod
    def record_tenant_processing(service: str, status: str) -> None:
        """Record the result of processing one tenant.

        Args:
            service: ``'production'`` or ``'procurement'``. Any other
                value records nothing and logs a warning.
            status: Processing outcome (success, failure, timeout).
        """
        if service == 'production':
            production_tenants_processed_total.labels(status=status).inc()
        elif service == 'procurement':
            procurement_tenants_processed_total.labels(status=status).inc()
        else:
            # Previously dropped silently; surface it so a mistyped
            # service name does not quietly lose metrics.
            logger.warning(
                "Unknown service for tenant processing metric",
                service=service,
                status=status,
            )

    @staticmethod
    def record_forecast_cache_lookup(tenant_id: str, hit: bool) -> None:
        """Count one forecast cache lookup as a hit or a miss.

        Args:
            tenant_id: Tenant the lookup was made for.
            hit: True increments the hit counter, False the miss counter.
        """
        if hit:
            forecast_cache_hits_total.labels(tenant_id=tenant_id).inc()
        else:
            forecast_cache_misses_total.labels(tenant_id=tenant_id).inc()

    @staticmethod
    def update_forecast_cache_hit_rate(
        tenant_id: str, hit_rate_percent: float
    ) -> None:
        """Set the forecast cache hit-rate gauge for a tenant.

        Args:
            tenant_id: Tenant the rate applies to.
            hit_rate_percent: Hit rate as a percentage (0-100); stored
                as given, without validation.
        """
        forecast_cache_hit_rate.labels(tenant_id=tenant_id).set(hit_rate_percent)

    @staticmethod
    def record_plan_rejection(tenant_id: str, auto_regenerated: bool) -> None:
        """Record one procurement plan rejection.

        Args:
            tenant_id: Tenant whose plan was rejected.
            auto_regenerated: Recorded as the label value ``'true'`` or
                ``'false'``.
        """
        procurement_plan_rejections_total.labels(
            tenant_id=tenant_id,
            auto_regenerated='true' if auto_regenerated else 'false',
        ).inc()

    @staticmethod
    def update_scheduler_health(
        service: str, scheduler_type: str, is_healthy: bool
    ) -> None:
        """Set the health gauge for one scheduler.

        Args:
            service: Value for the ``service`` label.
            scheduler_type: Value for the ``scheduler_type`` label.
            is_healthy: Stored as 1 when true, 0 when false.
        """
        scheduler_health_status.labels(
            service=service,
            scheduler_type=scheduler_type,
        ).set(1 if is_healthy else 0)

    @staticmethod
    def record_timeout(service: str, tenant_id: str) -> None:
        """Record one tenant processing timeout.

        Args:
            service: Value for the ``service`` label.
            tenant_id: Tenant whose processing timed out.
        """
        tenant_processing_timeout_total.labels(
            service=service,
            tenant_id=tenant_id,
        ).inc()
# Single module-level collector instance, created at import time.
metrics_collector: SchedulerMetricsCollector = SchedulerMetricsCollector()


def get_scheduler_metrics_collector() -> SchedulerMetricsCollector:
    """Return the module-level ``SchedulerMetricsCollector`` instance."""
    return metrics_collector