# shared/monitoring/scheduler_metrics.py
"""
Scheduler Metrics - Prometheus metrics for production and procurement schedulers

Provides comprehensive metrics for monitoring automated daily planning:
- Scheduler execution success/failure rates
- Tenant processing times
- Cache hit rates for forecasts
- Plan generation statistics

All metrics are registered at import time. ``SchedulerMetricsCollector``
groups small helpers that update them so callers do not have to repeat label
names.
"""

from prometheus_client import Counter, Histogram, Gauge, Info
import structlog

logger = structlog.get_logger()

# ================================================================
# PRODUCTION SCHEDULER METRICS
# ================================================================

production_schedules_generated_total = Counter(
    'production_schedules_generated_total',
    'Total number of production schedules generated',
    ['tenant_id', 'status']  # status: success, failure
)

production_schedule_generation_duration_seconds = Histogram(
    'production_schedule_generation_duration_seconds',
    'Time taken to generate production schedule per tenant',
    ['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300]  # seconds
)

production_tenants_processed_total = Counter(
    'production_tenants_processed_total',
    'Total number of tenants processed by production scheduler',
    ['status']  # status: success, failure, timeout
)

production_batches_created_total = Counter(
    'production_batches_created_total',
    'Total number of production batches created',
    ['tenant_id']
)

production_scheduler_runs_total = Counter(
    'production_scheduler_runs_total',
    'Total number of production scheduler executions',
    ['trigger']  # trigger: scheduled, manual, test
)

production_scheduler_errors_total = Counter(
    'production_scheduler_errors_total',
    'Total number of production scheduler errors',
    ['error_type']
)

# ================================================================
# PROCUREMENT SCHEDULER METRICS
# ================================================================

procurement_plans_generated_total = Counter(
    'procurement_plans_generated_total',
    'Total number of procurement plans generated',
    ['tenant_id', 'status']  # status: success, failure
)

procurement_plan_generation_duration_seconds = Histogram(
    'procurement_plan_generation_duration_seconds',
    'Time taken to generate procurement plan per tenant',
    ['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300]  # seconds
)

procurement_tenants_processed_total = Counter(
    'procurement_tenants_processed_total',
    'Total number of tenants processed by procurement scheduler',
    ['status']  # status: success, failure, timeout
)

procurement_requirements_created_total = Counter(
    'procurement_requirements_created_total',
    'Total number of procurement requirements created',
    ['tenant_id', 'priority']  # priority: critical, high, medium, low
)

procurement_scheduler_runs_total = Counter(
    'procurement_scheduler_runs_total',
    'Total number of procurement scheduler executions',
    ['trigger']  # trigger: scheduled, manual, test
)

procurement_plan_rejections_total = Counter(
    'procurement_plan_rejections_total',
    'Total number of procurement plans rejected',
    ['tenant_id', 'auto_regenerated']  # auto_regenerated: true, false
)

procurement_plans_by_status = Gauge(
    'procurement_plans_by_status',
    'Number of procurement plans by status',
    ['tenant_id', 'status']
)

# ================================================================
# FORECAST CACHING METRICS
# ================================================================

forecast_cache_hits_total = Counter(
    'forecast_cache_hits_total',
    'Total number of forecast cache hits',
    ['tenant_id']
)

forecast_cache_misses_total = Counter(
    'forecast_cache_misses_total',
    'Total number of forecast cache misses',
    ['tenant_id']
)

forecast_cache_hit_rate = Gauge(
    'forecast_cache_hit_rate',
    'Forecast cache hit rate percentage (0-100)',
    ['tenant_id']
)

forecast_cache_entries_total = Gauge(
    'forecast_cache_entries_total',
    'Total number of entries in forecast cache',
    ['cache_type']  # cache_type: single, batch
)

forecast_cache_invalidations_total = Counter(
    'forecast_cache_invalidations_total',
    'Total number of forecast cache invalidations',
    ['tenant_id', 'reason']  # reason: model_retrain, manual, expiry
)

# ================================================================
# GENERAL SCHEDULER HEALTH METRICS
# ================================================================

scheduler_health_status = Gauge(
    'scheduler_health_status',
    'Scheduler health status (1=healthy, 0=unhealthy)',
    ['service', 'scheduler_type']  # service: production, orders; scheduler_type: daily, weekly, cleanup
)

scheduler_last_run_timestamp = Gauge(
    'scheduler_last_run_timestamp',
    'Unix timestamp of last scheduler run',
    ['service', 'scheduler_type']
)

scheduler_next_run_timestamp = Gauge(
    'scheduler_next_run_timestamp',
    'Unix timestamp of next scheduled run',
    ['service', 'scheduler_type']
)

tenant_processing_timeout_total = Counter(
    'tenant_processing_timeout_total',
    'Total number of tenant processing timeouts',
    ['service', 'tenant_id']  # service: production, procurement
)

# ================================================================
# HELPER FUNCTIONS FOR METRICS
# ================================================================

# Lookup tables keyed by the ``service`` argument of the collector helpers.
# Routing through a table lets an unrecognised service be detected (and
# logged) in one place instead of silently falling through an if/elif chain.
_SCHEDULER_RUN_COUNTERS = {
    'production': production_scheduler_runs_total,
    'procurement': procurement_scheduler_runs_total,
}

_TENANT_PROCESSED_COUNTERS = {
    'production': production_tenants_processed_total,
    'procurement': procurement_tenants_processed_total,
}


class SchedulerMetricsCollector:
    """Helper class for collecting scheduler metrics.

    Every method is a ``staticmethod`` that updates one or more of the
    module-level Prometheus metrics; the class holds no state of its own.
    """

    @staticmethod
    def record_production_schedule_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        batches_created: int,
    ) -> None:
        """Record the outcome of one production schedule generation.

        Args:
            tenant_id: Tenant the schedule was generated for.
            success: Whether generation succeeded; selects the ``status``
                label (``success`` or ``failure``).
            duration_seconds: Wall time of the generation, observed in the
                duration histogram regardless of outcome.
            batches_created: Number of batches created; only added to the
                batch counter when ``success`` is true.

        Raises:
            ValueError: From prometheus_client if ``batches_created`` is
                negative (counters only accept non-negative increments).
        """
        status = 'success' if success else 'failure'
        production_schedules_generated_total.labels(
            tenant_id=tenant_id, status=status
        ).inc()
        production_schedule_generation_duration_seconds.labels(
            tenant_id=tenant_id
        ).observe(duration_seconds)

        if success:
            production_batches_created_total.labels(
                tenant_id=tenant_id
            ).inc(batches_created)

    @staticmethod
    def record_procurement_plan_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        requirements_count: int,
        priority: str = 'medium',
    ) -> None:
        """Record the outcome of one procurement plan generation.

        Args:
            tenant_id: Tenant the plan was generated for.
            success: Whether generation succeeded; selects the ``status``
                label (``success`` or ``failure``).
            duration_seconds: Wall time of the generation, observed in the
                duration histogram regardless of outcome.
            requirements_count: Number of requirements created; only added
                to the requirements counter when ``success`` is true.
            priority: Value for the ``priority`` label of the requirements
                counter (critical, high, medium, low). Defaults to
                ``'medium'``, which is what this helper always recorded
                before the parameter existed. Call once per priority to
                record a per-priority breakdown.

        Raises:
            ValueError: From prometheus_client if ``requirements_count`` is
                negative (counters only accept non-negative increments).
        """
        status = 'success' if success else 'failure'
        procurement_plans_generated_total.labels(
            tenant_id=tenant_id, status=status
        ).inc()
        procurement_plan_generation_duration_seconds.labels(
            tenant_id=tenant_id
        ).observe(duration_seconds)

        if success:
            procurement_requirements_created_total.labels(
                tenant_id=tenant_id,
                priority=priority,
            ).inc(requirements_count)

    @staticmethod
    def record_scheduler_run(service: str, trigger: str = 'scheduled') -> None:
        """Record one scheduler execution.

        Args:
            service: ``'production'`` or ``'procurement'``. Any other value
                records nothing and logs a warning.
            trigger: Value for the ``trigger`` label (scheduled, manual,
                test).
        """
        counter = _SCHEDULER_RUN_COUNTERS.get(service)
        if counter is None:
            # Never raise from metrics code; just make the drop visible.
            logger.warning(
                "Unknown scheduler service, run metric not recorded",
                service=service,
                trigger=trigger,
            )
            return
        counter.labels(trigger=trigger).inc()

    @staticmethod
    def record_tenant_processing(service: str, status: str) -> None:
        """Record the result of processing one tenant.

        Args:
            service: ``'production'`` or ``'procurement'``. Any other value
                records nothing and logs a warning.
            status: Value for the ``status`` label (success, failure,
                timeout).
        """
        counter = _TENANT_PROCESSED_COUNTERS.get(service)
        if counter is None:
            # Never raise from metrics code; just make the drop visible.
            logger.warning(
                "Unknown scheduler service, tenant processing metric not recorded",
                service=service,
                status=status,
            )
            return
        counter.labels(status=status).inc()

    @staticmethod
    def record_forecast_cache_lookup(tenant_id: str, hit: bool) -> None:
        """Increment the forecast cache hit or miss counter for a tenant.

        Args:
            tenant_id: Tenant whose forecast was looked up.
            hit: True increments the hit counter, False the miss counter.
        """
        if hit:
            forecast_cache_hits_total.labels(tenant_id=tenant_id).inc()
        else:
            forecast_cache_misses_total.labels(tenant_id=tenant_id).inc()

    @staticmethod
    def update_forecast_cache_hit_rate(tenant_id: str, hit_rate_percent: float) -> None:
        """Set the forecast cache hit-rate gauge for a tenant.

        Args:
            tenant_id: Tenant the rate applies to.
            hit_rate_percent: Hit rate as a percentage; the gauge is
                described as 0-100, but the value is stored as given.
        """
        forecast_cache_hit_rate.labels(tenant_id=tenant_id).set(hit_rate_percent)

    @staticmethod
    def record_plan_rejection(tenant_id: str, auto_regenerated: bool) -> None:
        """Record a rejected procurement plan.

        Args:
            tenant_id: Tenant whose plan was rejected.
            auto_regenerated: Stored in the ``auto_regenerated`` label as the
                string ``'true'`` or ``'false'``.
        """
        procurement_plan_rejections_total.labels(
            tenant_id=tenant_id,
            auto_regenerated='true' if auto_regenerated else 'false'
        ).inc()

    @staticmethod
    def update_scheduler_health(service: str, scheduler_type: str, is_healthy: bool) -> None:
        """Set the scheduler health gauge to 1 (healthy) or 0 (unhealthy).

        Args:
            service: Value for the ``service`` label.
            scheduler_type: Value for the ``scheduler_type`` label.
            is_healthy: Health state to publish.
        """
        scheduler_health_status.labels(
            service=service,
            scheduler_type=scheduler_type
        ).set(1 if is_healthy else 0)

    @staticmethod
    def record_timeout(service: str, tenant_id: str) -> None:
        """Increment the tenant processing timeout counter.

        Args:
            service: Value for the ``service`` label (production,
                procurement).
            tenant_id: Tenant whose processing timed out.
        """
        tenant_processing_timeout_total.labels(
            service=service,
            tenant_id=tenant_id
        ).inc()


# Global metrics collector instance
metrics_collector = SchedulerMetricsCollector()


def get_scheduler_metrics_collector() -> SchedulerMetricsCollector:
    """Get global scheduler metrics collector"""
    return metrics_collector