# shared/monitoring/scheduler_metrics.py
|
|
"""
Scheduler Metrics - Prometheus metrics for production and procurement schedulers

Provides comprehensive metrics for monitoring automated daily planning:
- Scheduler execution success/failure rates
- Tenant processing times
- Cache hit rates for forecasts
- Plan generation statistics
"""
|
|
|
|
from prometheus_client import Counter, Histogram, Gauge, Info
|
|
import structlog
|
|
|
|
# Module-level structlog logger for this metrics module.
logger = structlog.get_logger()
|
|
|
|
# ================================================================
# PRODUCTION SCHEDULER METRICS
# ================================================================
|
|
|
|
# Schedules generated, broken down per tenant and by outcome.
production_schedules_generated_total = Counter(
    name='production_schedules_generated_total',
    documentation='Total number of production schedules generated',
    labelnames=('tenant_id', 'status'),  # status: success, failure
)

# Per-tenant generation latency; bucket upper bounds are in seconds.
production_schedule_generation_duration_seconds = Histogram(
    name='production_schedule_generation_duration_seconds',
    documentation='Time taken to generate production schedule per tenant',
    labelnames=('tenant_id',),
    buckets=(1, 5, 10, 30, 60, 120, 180, 300),
)

# Tenants handled by the production scheduler, by processing result.
production_tenants_processed_total = Counter(
    name='production_tenants_processed_total',
    documentation='Total number of tenants processed by production scheduler',
    labelnames=('status',),  # status: success, failure, timeout
)

# Production batches created, per tenant.
production_batches_created_total = Counter(
    name='production_batches_created_total',
    documentation='Total number of production batches created',
    labelnames=('tenant_id',),
)

# Scheduler executions, by what started them.
production_scheduler_runs_total = Counter(
    name='production_scheduler_runs_total',
    documentation='Total number of production scheduler executions',
    labelnames=('trigger',),  # trigger: scheduled, manual, test
)

# Scheduler errors, grouped by an error-type label.
production_scheduler_errors_total = Counter(
    name='production_scheduler_errors_total',
    documentation='Total number of production scheduler errors',
    labelnames=('error_type',),
)
|
|
|
|
# ================================================================
# PROCUREMENT SCHEDULER METRICS
# ================================================================
|
|
|
|
# Procurement plans generated, broken down per tenant and by outcome.
procurement_plans_generated_total = Counter(
    name='procurement_plans_generated_total',
    documentation='Total number of procurement plans generated',
    labelnames=('tenant_id', 'status'),  # status: success, failure
)

# Per-tenant plan generation latency; bucket upper bounds are in seconds.
procurement_plan_generation_duration_seconds = Histogram(
    name='procurement_plan_generation_duration_seconds',
    documentation='Time taken to generate procurement plan per tenant',
    labelnames=('tenant_id',),
    buckets=(1, 5, 10, 30, 60, 120, 180, 300),
)

# Tenants handled by the procurement scheduler, by processing result.
procurement_tenants_processed_total = Counter(
    name='procurement_tenants_processed_total',
    documentation='Total number of tenants processed by procurement scheduler',
    labelnames=('status',),  # status: success, failure, timeout
)

# Procurement requirements created, per tenant and priority.
procurement_requirements_created_total = Counter(
    name='procurement_requirements_created_total',
    documentation='Total number of procurement requirements created',
    labelnames=('tenant_id', 'priority'),  # priority: critical, high, medium, low
)

# Scheduler executions, by what started them.
procurement_scheduler_runs_total = Counter(
    name='procurement_scheduler_runs_total',
    documentation='Total number of procurement scheduler executions',
    labelnames=('trigger',),  # trigger: scheduled, manual, test
)

# Rejected plans, per tenant and whether a replacement was auto-generated.
procurement_plan_rejections_total = Counter(
    name='procurement_plan_rejections_total',
    documentation='Total number of procurement plans rejected',
    labelnames=('tenant_id', 'auto_regenerated'),  # auto_regenerated: true, false
)

# Current number of plans in each status, per tenant.
procurement_plans_by_status = Gauge(
    name='procurement_plans_by_status',
    documentation='Number of procurement plans by status',
    labelnames=('tenant_id', 'status'),
)
|
|
|
|
# ================================================================
# FORECAST CACHING METRICS
# ================================================================
|
|
|
|
# Forecast cache hits, per tenant.
forecast_cache_hits_total = Counter(
    name='forecast_cache_hits_total',
    documentation='Total number of forecast cache hits',
    labelnames=('tenant_id',),
)

# Forecast cache misses, per tenant.
forecast_cache_misses_total = Counter(
    name='forecast_cache_misses_total',
    documentation='Total number of forecast cache misses',
    labelnames=('tenant_id',),
)

# Hit rate per tenant, expressed as a percentage rather than a 0-1 ratio.
forecast_cache_hit_rate = Gauge(
    name='forecast_cache_hit_rate',
    documentation='Forecast cache hit rate percentage (0-100)',
    labelnames=('tenant_id',),
)

# Current number of cache entries; a gauge despite the `_total` suffix.
forecast_cache_entries_total = Gauge(
    name='forecast_cache_entries_total',
    documentation='Total number of entries in forecast cache',
    labelnames=('cache_type',),  # cache_type: single, batch
)

# Cache invalidations, per tenant and cause.
forecast_cache_invalidations_total = Counter(
    name='forecast_cache_invalidations_total',
    documentation='Total number of forecast cache invalidations',
    labelnames=('tenant_id', 'reason'),  # reason: model_retrain, manual, expiry
)
|
|
|
|
# ================================================================
# GENERAL SCHEDULER HEALTH METRICS
# ================================================================
|
|
|
|
# Health flag per scheduler.
# service: production, orders; scheduler_type: daily, weekly, cleanup
scheduler_health_status = Gauge(
    name='scheduler_health_status',
    documentation='Scheduler health status (1=healthy, 0=unhealthy)',
    labelnames=('service', 'scheduler_type'),
)

# When each scheduler last ran, as a Unix timestamp.
scheduler_last_run_timestamp = Gauge(
    name='scheduler_last_run_timestamp',
    documentation='Unix timestamp of last scheduler run',
    labelnames=('service', 'scheduler_type'),
)

# When each scheduler is next due to run, as a Unix timestamp.
scheduler_next_run_timestamp = Gauge(
    name='scheduler_next_run_timestamp',
    documentation='Unix timestamp of next scheduled run',
    labelnames=('service', 'scheduler_type'),
)

# Tenant processing timeouts, per service and tenant.
tenant_processing_timeout_total = Counter(
    name='tenant_processing_timeout_total',
    documentation='Total number of tenant processing timeouts',
    labelnames=('service', 'tenant_id'),  # service: production, procurement
)
|
|
|
|
# ================================================================
# HELPER FUNCTIONS FOR METRICS
# ================================================================
|
|
|
|
|
|
class SchedulerMetricsCollector:
    """Helper class for collecting scheduler metrics.

    Every method is a ``@staticmethod`` that forwards to the Prometheus
    metric objects defined at module level, so the class holds no state.
    """

    @staticmethod
    def record_production_schedule_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        batches_created: int,
    ) -> None:
        """Record one production schedule generation attempt.

        Args:
            tenant_id: Tenant the schedule was generated for.
            success: Whether generation succeeded; selects the ``status``
                label (``success`` or ``failure``).
            duration_seconds: Time the generation took. Observed for both
                successful and failed attempts.
            batches_created: Number of batches created. Only added to the
                batch counter when ``success`` is true.
        """
        status = 'success' if success else 'failure'
        production_schedules_generated_total.labels(tenant_id=tenant_id, status=status).inc()
        production_schedule_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if success:
            production_batches_created_total.labels(tenant_id=tenant_id).inc(batches_created)

    @staticmethod
    def record_procurement_plan_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        requirements_count: int,
        priority: str = 'medium',
    ) -> None:
        """Record one procurement plan generation attempt.

        Args:
            tenant_id: Tenant the plan was generated for.
            success: Whether generation succeeded; selects the ``status``
                label (``success`` or ``failure``).
            duration_seconds: Time the generation took. Observed for both
                successful and failed attempts.
            requirements_count: Number of requirements created. Only added
                to the requirements counter when ``success`` is true.
            priority: Value for the ``priority`` label of the requirements
                counter (critical, high, medium, low). Defaults to
                ``'medium'``, which is what callers that do not pass it
                have always recorded.
        """
        status = 'success' if success else 'failure'
        procurement_plans_generated_total.labels(tenant_id=tenant_id, status=status).inc()
        procurement_plan_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if success:
            procurement_requirements_created_total.labels(
                tenant_id=tenant_id,
                priority=priority,
            ).inc(requirements_count)

    @staticmethod
    def record_scheduler_run(service: str, trigger: str = 'scheduled') -> None:
        """Record one scheduler execution.

        Args:
            service: ``'production'`` or ``'procurement'``. Any other value
                is not recorded; a warning is logged instead.
            trigger: What started the run (scheduled, manual, test).
        """
        run_counters = {
            'production': production_scheduler_runs_total,
            'procurement': procurement_scheduler_runs_total,
        }
        counter = run_counters.get(service)
        if counter is None:
            # Surface the dropped sample instead of losing it silently.
            logger.warning(
                "Unknown scheduler service, run not recorded",
                service=service,
                trigger=trigger,
            )
            return
        counter.labels(trigger=trigger).inc()

    @staticmethod
    def record_tenant_processing(service: str, status: str) -> None:
        """Record the result of processing one tenant.

        Args:
            service: ``'production'`` or ``'procurement'``. Any other value
                is not recorded; a warning is logged instead.
            status: Processing result (success, failure, timeout).
        """
        processed_counters = {
            'production': production_tenants_processed_total,
            'procurement': procurement_tenants_processed_total,
        }
        counter = processed_counters.get(service)
        if counter is None:
            # Surface the dropped sample instead of losing it silently.
            logger.warning(
                "Unknown scheduler service, tenant processing not recorded",
                service=service,
                status=status,
            )
            return
        counter.labels(status=status).inc()

    @staticmethod
    def record_forecast_cache_lookup(tenant_id: str, hit: bool) -> None:
        """Record a forecast cache lookup as either a hit or a miss.

        Args:
            tenant_id: Tenant the lookup was made for.
            hit: True increments the hit counter, False the miss counter.
        """
        if hit:
            forecast_cache_hits_total.labels(tenant_id=tenant_id).inc()
        else:
            forecast_cache_misses_total.labels(tenant_id=tenant_id).inc()

    @staticmethod
    def update_forecast_cache_hit_rate(tenant_id: str, hit_rate_percent: float) -> None:
        """Set the forecast cache hit-rate gauge for a tenant.

        Args:
            tenant_id: Tenant the rate applies to.
            hit_rate_percent: Hit rate as a percentage (0-100). The value is
                stored as given; it is not range-checked here.
        """
        forecast_cache_hit_rate.labels(tenant_id=tenant_id).set(hit_rate_percent)

    @staticmethod
    def record_plan_rejection(tenant_id: str, auto_regenerated: bool) -> None:
        """Record a procurement plan rejection.

        Args:
            tenant_id: Tenant whose plan was rejected.
            auto_regenerated: Recorded as the label value ``'true'`` or
                ``'false'``.
        """
        procurement_plan_rejections_total.labels(
            tenant_id=tenant_id,
            auto_regenerated='true' if auto_regenerated else 'false',
        ).inc()

    @staticmethod
    def update_scheduler_health(service: str, scheduler_type: str, is_healthy: bool) -> None:
        """Set the health gauge for one scheduler.

        Args:
            service: Value for the ``service`` label.
            scheduler_type: Value for the ``scheduler_type`` label.
            is_healthy: Stored as 1 when true and 0 when false.
        """
        scheduler_health_status.labels(
            service=service,
            scheduler_type=scheduler_type,
        ).set(1 if is_healthy else 0)

    @staticmethod
    def record_timeout(service: str, tenant_id: str) -> None:
        """Record a tenant processing timeout.

        Args:
            service: Value for the ``service`` label (production,
                procurement).
            tenant_id: Tenant whose processing timed out.
        """
        tenant_processing_timeout_total.labels(
            service=service,
            tenant_id=tenant_id,
        ).inc()
|
|
|
|
|
|
# Global metrics collector instance
|
|
# Shared collector instance returned by get_scheduler_metrics_collector().
metrics_collector = SchedulerMetricsCollector()
|
|
|
|
|
|
def get_scheduler_metrics_collector() -> SchedulerMetricsCollector:
    """Return the module-wide scheduler metrics collector.

    Returns:
        The shared ``metrics_collector`` instance defined at module level.
    """
    collector = metrics_collector
    return collector
|