REFACTOR production scheduler
This commit is contained in:
258
shared/monitoring/scheduler_metrics.py
Normal file
258
shared/monitoring/scheduler_metrics.py
Normal file
@@ -0,0 +1,258 @@
|
||||
# shared/monitoring/scheduler_metrics.py
|
||||
"""
|
||||
Scheduler Metrics - Prometheus metrics for production and procurement schedulers
|
||||
|
||||
Provides comprehensive metrics for monitoring automated daily planning:
|
||||
- Scheduler execution success/failure rates
|
||||
- Tenant processing times
|
||||
- Cache hit rates for forecasts
|
||||
- Plan generation statistics
|
||||
"""
|
||||
|
||||
from prometheus_client import Counter, Histogram, Gauge, Info
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
# ================================================================
|
||||
# PRODUCTION SCHEDULER METRICS
|
||||
# ================================================================
|
||||
|
||||
# Schedules generated, split per tenant and by outcome.
# `status` label values: success, failure.
production_schedules_generated_total = Counter(
    name='production_schedules_generated_total',
    documentation='Total number of production schedules generated',
    labelnames=('tenant_id', 'status'),
)

# Per-tenant generation latency; bucket upper bounds are in seconds.
production_schedule_generation_duration_seconds = Histogram(
    name='production_schedule_generation_duration_seconds',
    documentation='Time taken to generate production schedule per tenant',
    labelnames=('tenant_id',),
    buckets=(1, 5, 10, 30, 60, 120, 180, 300),
)

# Tenants handled by the production scheduler, aggregated across tenants.
# `status` label values: success, failure, timeout.
production_tenants_processed_total = Counter(
    name='production_tenants_processed_total',
    documentation='Total number of tenants processed by production scheduler',
    labelnames=('status',),
)

# Production batches created, per tenant.
production_batches_created_total = Counter(
    name='production_batches_created_total',
    documentation='Total number of production batches created',
    labelnames=('tenant_id',),
)

# Scheduler executions by what started them.
# `trigger` label values: scheduled, manual, test.
production_scheduler_runs_total = Counter(
    name='production_scheduler_runs_total',
    documentation='Total number of production scheduler executions',
    labelnames=('trigger',),
)

# Scheduler errors, keyed by a caller-supplied error type.
production_scheduler_errors_total = Counter(
    name='production_scheduler_errors_total',
    documentation='Total number of production scheduler errors',
    labelnames=('error_type',),
)
|
||||
|
||||
# ================================================================
|
||||
# PROCUREMENT SCHEDULER METRICS
|
||||
# ================================================================
|
||||
|
||||
# Procurement plans generated, split per tenant and by outcome.
# `status` label values: success, failure.
procurement_plans_generated_total = Counter(
    name='procurement_plans_generated_total',
    documentation='Total number of procurement plans generated',
    labelnames=('tenant_id', 'status'),
)

# Per-tenant plan generation latency; bucket upper bounds are in seconds.
procurement_plan_generation_duration_seconds = Histogram(
    name='procurement_plan_generation_duration_seconds',
    documentation='Time taken to generate procurement plan per tenant',
    labelnames=('tenant_id',),
    buckets=(1, 5, 10, 30, 60, 120, 180, 300),
)

# Tenants handled by the procurement scheduler, aggregated across tenants.
# `status` label values: success, failure, timeout.
procurement_tenants_processed_total = Counter(
    name='procurement_tenants_processed_total',
    documentation='Total number of tenants processed by procurement scheduler',
    labelnames=('status',),
)

# Procurement requirements created, per tenant and priority.
# `priority` label values: critical, high, medium, low.
procurement_requirements_created_total = Counter(
    name='procurement_requirements_created_total',
    documentation='Total number of procurement requirements created',
    labelnames=('tenant_id', 'priority'),
)

# Scheduler executions by what started them.
# `trigger` label values: scheduled, manual, test.
procurement_scheduler_runs_total = Counter(
    name='procurement_scheduler_runs_total',
    documentation='Total number of procurement scheduler executions',
    labelnames=('trigger',),
)

# Rejected plans, per tenant.
# `auto_regenerated` label values: true, false.
procurement_plan_rejections_total = Counter(
    name='procurement_plan_rejections_total',
    documentation='Total number of procurement plans rejected',
    labelnames=('tenant_id', 'auto_regenerated'),
)

# Current number of plans in each status, per tenant (a gauge, so it can
# go down as well as up).
procurement_plans_by_status = Gauge(
    name='procurement_plans_by_status',
    documentation='Number of procurement plans by status',
    labelnames=('tenant_id', 'status'),
)
|
||||
|
||||
# ================================================================
|
||||
# FORECAST CACHING METRICS
|
||||
# ================================================================
|
||||
|
||||
# Forecast cache lookups that were served from the cache, per tenant.
forecast_cache_hits_total = Counter(
    name='forecast_cache_hits_total',
    documentation='Total number of forecast cache hits',
    labelnames=('tenant_id',),
)

# Forecast cache lookups that were not served from the cache, per tenant.
forecast_cache_misses_total = Counter(
    name='forecast_cache_misses_total',
    documentation='Total number of forecast cache misses',
    labelnames=('tenant_id',),
)

# Hit rate per tenant, expressed as a percentage in the range 0-100.
forecast_cache_hit_rate = Gauge(
    name='forecast_cache_hit_rate',
    documentation='Forecast cache hit rate percentage (0-100)',
    labelnames=('tenant_id',),
)

# Current size of the forecast cache.
# `cache_type` label values: single, batch.
forecast_cache_entries_total = Gauge(
    name='forecast_cache_entries_total',
    documentation='Total number of entries in forecast cache',
    labelnames=('cache_type',),
)

# Cache invalidations per tenant.
# `reason` label values: model_retrain, manual, expiry.
forecast_cache_invalidations_total = Counter(
    name='forecast_cache_invalidations_total',
    documentation='Total number of forecast cache invalidations',
    labelnames=('tenant_id', 'reason'),
)
|
||||
|
||||
# ================================================================
|
||||
# GENERAL SCHEDULER HEALTH METRICS
|
||||
# ================================================================
|
||||
|
||||
# Health flag per scheduler: 1 means healthy, 0 means unhealthy.
# `service` label values: production, orders.
# `scheduler_type` label values: daily, weekly, cleanup.
scheduler_health_status = Gauge(
    name='scheduler_health_status',
    documentation='Scheduler health status (1=healthy, 0=unhealthy)',
    labelnames=('service', 'scheduler_type'),
)

# When each scheduler last ran, as a Unix timestamp.
scheduler_last_run_timestamp = Gauge(
    name='scheduler_last_run_timestamp',
    documentation='Unix timestamp of last scheduler run',
    labelnames=('service', 'scheduler_type'),
)

# When each scheduler is next due to run, as a Unix timestamp.
scheduler_next_run_timestamp = Gauge(
    name='scheduler_next_run_timestamp',
    documentation='Unix timestamp of next scheduled run',
    labelnames=('service', 'scheduler_type'),
)

# Tenant processing timeouts, per service and tenant.
# `service` label values: production, procurement.
tenant_processing_timeout_total = Counter(
    name='tenant_processing_timeout_total',
    documentation='Total number of tenant processing timeouts',
    labelnames=('service', 'tenant_id'),
)
|
||||
|
||||
# ================================================================
|
||||
# HELPER FUNCTIONS FOR METRICS
|
||||
# ================================================================
|
||||
|
||||
|
||||
class SchedulerMetricsCollector:
    """Helper class for collecting scheduler metrics.

    Every method is a ``@staticmethod`` that updates one or more of the
    module-level Prometheus metrics declared in this file, so the methods
    can be called on the class or on an instance interchangeably.
    """

    @staticmethod
    def record_production_schedule_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        batches_created: int,
    ) -> None:
        """Record one production schedule generation attempt.

        Args:
            tenant_id: Tenant the schedule was generated for.
            success: Outcome of the attempt; recorded as the ``status``
                label (``'success'`` or ``'failure'``).
            duration_seconds: Value observed on the duration histogram.
                It is observed for failed attempts as well.
            batches_created: Amount added to the batches counter. Only
                applied when ``success`` is true.
        """
        status = 'success' if success else 'failure'
        production_schedules_generated_total.labels(tenant_id=tenant_id, status=status).inc()
        production_schedule_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if success:
            production_batches_created_total.labels(tenant_id=tenant_id).inc(batches_created)

    @staticmethod
    def record_procurement_plan_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        requirements_count: int,
        priority: str = 'medium',
    ) -> None:
        """Record one procurement plan generation attempt.

        Args:
            tenant_id: Tenant the plan was generated for.
            success: Outcome of the attempt; recorded as the ``status``
                label (``'success'`` or ``'failure'``).
            duration_seconds: Value observed on the duration histogram.
                It is observed for failed attempts as well.
            requirements_count: Amount added to the requirements counter.
                Only applied when ``success`` is true.
            priority: Value of the ``priority`` label for the requirements
                counter (the metric documents critical, high, medium and
                low). Defaults to ``'medium'``, which is what this method
                always recorded before the parameter existed.
        """
        status = 'success' if success else 'failure'
        procurement_plans_generated_total.labels(tenant_id=tenant_id, status=status).inc()
        procurement_plan_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if success:
            # All requirements of one call share a single priority label;
            # callers with mixed priorities need one call per priority.
            procurement_requirements_created_total.labels(
                tenant_id=tenant_id,
                priority=priority,
            ).inc(requirements_count)

    @staticmethod
    def record_scheduler_run(service: str, trigger: str = 'scheduled') -> None:
        """Record one scheduler execution.

        Args:
            service: ``'production'`` or ``'procurement'``; selects which
                runs counter is incremented.
            trigger: Value of the ``trigger`` label (the metrics document
                scheduled, manual and test).
        """
        if service == 'production':
            production_scheduler_runs_total.labels(trigger=trigger).inc()
        elif service == 'procurement':
            procurement_scheduler_runs_total.labels(trigger=trigger).inc()
        else:
            # No counter exists for other services. Log instead of dropping
            # the sample silently so a mistyped service name is noticed.
            logger.warning(
                "Unknown scheduler service, run not recorded",
                service=service,
                trigger=trigger,
            )

    @staticmethod
    def record_tenant_processing(service: str, status: str) -> None:
        """Record the result of processing one tenant.

        Args:
            service: ``'production'`` or ``'procurement'``; selects which
                tenants-processed counter is incremented.
            status: Value of the ``status`` label (the metrics document
                success, failure and timeout).
        """
        if service == 'production':
            production_tenants_processed_total.labels(status=status).inc()
        elif service == 'procurement':
            procurement_tenants_processed_total.labels(status=status).inc()
        else:
            # No counter exists for other services. Log instead of dropping
            # the sample silently so a mistyped service name is noticed.
            logger.warning(
                "Unknown scheduler service, tenant processing not recorded",
                service=service,
                status=status,
            )

    @staticmethod
    def record_forecast_cache_lookup(tenant_id: str, hit: bool) -> None:
        """Record one forecast cache lookup.

        Args:
            tenant_id: Tenant the lookup was made for.
            hit: True increments the hits counter, false the misses counter.
        """
        if hit:
            forecast_cache_hits_total.labels(tenant_id=tenant_id).inc()
        else:
            forecast_cache_misses_total.labels(tenant_id=tenant_id).inc()

    @staticmethod
    def update_forecast_cache_hit_rate(tenant_id: str, hit_rate_percent: float) -> None:
        """Set the forecast cache hit-rate gauge for a tenant.

        Args:
            tenant_id: Tenant the rate applies to.
            hit_rate_percent: Hit rate as a percentage; the gauge is
                documented as 0-100. The value is stored as given, without
                range checking.
        """
        forecast_cache_hit_rate.labels(tenant_id=tenant_id).set(hit_rate_percent)

    @staticmethod
    def record_plan_rejection(tenant_id: str, auto_regenerated: bool) -> None:
        """Record one rejected procurement plan.

        Args:
            tenant_id: Tenant whose plan was rejected.
            auto_regenerated: Recorded as the ``auto_regenerated`` label,
                as the string ``'true'`` or ``'false'``.
        """
        procurement_plan_rejections_total.labels(
            tenant_id=tenant_id,
            auto_regenerated='true' if auto_regenerated else 'false',
        ).inc()

    @staticmethod
    def update_scheduler_health(service: str, scheduler_type: str, is_healthy: bool) -> None:
        """Set the health gauge for one scheduler.

        Args:
            service: Value of the ``service`` label.
            scheduler_type: Value of the ``scheduler_type`` label.
            is_healthy: Stored as 1 when true and 0 when false.
        """
        scheduler_health_status.labels(
            service=service,
            scheduler_type=scheduler_type,
        ).set(1 if is_healthy else 0)

    @staticmethod
    def record_timeout(service: str, tenant_id: str) -> None:
        """Record one tenant processing timeout.

        Args:
            service: Value of the ``service`` label (the metric documents
                production and procurement).
            tenant_id: Tenant whose processing timed out.
        """
        tenant_processing_timeout_total.labels(
            service=service,
            tenant_id=tenant_id,
        ).inc()
|
||||
|
||||
|
||||
# Module-wide collector instance, created once at import time and handed
# out by get_scheduler_metrics_collector().
metrics_collector = SchedulerMetricsCollector()


def get_scheduler_metrics_collector() -> SchedulerMetricsCollector:
    """Return the module-wide scheduler metrics collector.

    Returns:
        The ``metrics_collector`` instance defined in this module; the
        same object is returned on every call.
    """
    collector = metrics_collector
    return collector
|
||||
Reference in New Issue
Block a user