# bakery-ia/shared/monitoring/scheduler_metrics.py

# shared/monitoring/scheduler_metrics.py
"""
Scheduler Metrics - Prometheus metrics for production and procurement schedulers

Provides comprehensive metrics for monitoring automated daily planning:
- Scheduler execution success/failure rates
- Tenant processing times
- Cache hit rates for forecasts
- Plan generation statistics
"""

from prometheus_client import Counter, Histogram, Gauge, Info
import structlog

logger = structlog.get_logger()

# ================================================================
# PRODUCTION SCHEDULER METRICS
# ================================================================

# Production schedule generation attempts, one series per (tenant, outcome).
# Incremented by SchedulerMetricsCollector.record_production_schedule_generated.
production_schedules_generated_total = Counter(
    'production_schedules_generated_total',
    'Total number of production schedules generated',
    ['tenant_id', 'status']  # status: success, failure
)

# Wall-clock time of one schedule generation, per tenant. Observed on every
# attempt (success or failure) by record_production_schedule_generated.
# Bucket upper bounds run from 1 second to 5 minutes.
production_schedule_generation_duration_seconds = Histogram(
    'production_schedule_generation_duration_seconds',
    'Time taken to generate production schedule per tenant',
    ['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300]  # seconds
)

# Tenant-level processing outcomes, aggregated across tenants (no tenant_id
# label). Incremented by SchedulerMetricsCollector.record_tenant_processing
# when service == 'production'; the status string is passed through as given.
production_tenants_processed_total = Counter(
    'production_tenants_processed_total',
    'Total number of tenants processed by production scheduler',
    ['status']  # status: success, failure, timeout
)

# Number of production batches created, per tenant. Increased by the
# batches_created argument of record_production_schedule_generated, and only
# when that call reports success.
production_batches_created_total = Counter(
    'production_batches_created_total',
    'Total number of production batches created',
    ['tenant_id']
)

# Scheduler executions by what started them. Incremented by
# SchedulerMetricsCollector.record_scheduler_run when service == 'production'.
production_scheduler_runs_total = Counter(
    'production_scheduler_runs_total',
    'Total number of production scheduler executions',
    ['trigger']  # trigger: scheduled, manual, test
)

# Scheduler errors by error type. No SchedulerMetricsCollector helper updates
# this counter; code that uses it must call .labels(error_type=...).inc()
# on it directly.
production_scheduler_errors_total = Counter(
    'production_scheduler_errors_total',
    'Total number of production scheduler errors',
    ['error_type']
)

# ================================================================
# PROCUREMENT SCHEDULER METRICS
# ================================================================

# Procurement plan generation attempts, one series per (tenant, outcome).
# Incremented by SchedulerMetricsCollector.record_procurement_plan_generated.
procurement_plans_generated_total = Counter(
    'procurement_plans_generated_total',
    'Total number of procurement plans generated',
    ['tenant_id', 'status']  # status: success, failure
)

# Wall-clock time of one plan generation, per tenant. Observed on every
# attempt (success or failure) by record_procurement_plan_generated.
# Bucket upper bounds are in seconds and run from 1 second to 5 minutes.
procurement_plan_generation_duration_seconds = Histogram(
    'procurement_plan_generation_duration_seconds',
    'Time taken to generate procurement plan per tenant',
    ['tenant_id'],
    buckets=[1, 5, 10, 30, 60, 120, 180, 300]
)

# Tenant-level processing outcomes, aggregated across tenants (no tenant_id
# label). Incremented by SchedulerMetricsCollector.record_tenant_processing
# when service == 'procurement'; the status string is passed through as given.
procurement_tenants_processed_total = Counter(
    'procurement_tenants_processed_total',
    'Total number of tenants processed by procurement scheduler',
    ['status']  # status: success, failure, timeout
)

# Number of procurement requirements created, per tenant and priority.
# Increased on success by record_procurement_plan_generated.
procurement_requirements_created_total = Counter(
    'procurement_requirements_created_total',
    'Total number of procurement requirements created',
    ['tenant_id', 'priority']  # priority: critical, high, medium, low
)

# Scheduler executions by what started them. Incremented by
# SchedulerMetricsCollector.record_scheduler_run when service == 'procurement'.
procurement_scheduler_runs_total = Counter(
    'procurement_scheduler_runs_total',
    'Total number of procurement scheduler executions',
    ['trigger']  # trigger: scheduled, manual, test
)

# Rejected procurement plans, per tenant. Incremented by
# SchedulerMetricsCollector.record_plan_rejection, which writes the
# auto_regenerated label as the lowercase strings 'true' / 'false'.
procurement_plan_rejections_total = Counter(
    'procurement_plan_rejections_total',
    'Total number of procurement plans rejected',
    ['tenant_id', 'auto_regenerated']  # auto_regenerated: true, false
)

# Current count of procurement plans in each status, per tenant. This is a
# Gauge (set to a value, not incremented as events occur). No
# SchedulerMetricsCollector helper updates it; the status label values are not
# defined in this module.
procurement_plans_by_status = Gauge(
    'procurement_plans_by_status',
    'Number of procurement plans by status',
    ['tenant_id', 'status']
)

# ================================================================
# FORECAST CACHING METRICS
# ================================================================

# Forecast cache lookups that were served from cache, per tenant. Incremented
# by SchedulerMetricsCollector.record_forecast_cache_lookup when hit is true.
forecast_cache_hits_total = Counter(
    'forecast_cache_hits_total',
    'Total number of forecast cache hits',
    ['tenant_id']
)

# Forecast cache lookups that were not served from cache, per tenant.
# Incremented by record_forecast_cache_lookup when hit is false.
forecast_cache_misses_total = Counter(
    'forecast_cache_misses_total',
    'Total number of forecast cache misses',
    ['tenant_id']
)

# Hit rate as a percentage, per tenant. Set verbatim from the value passed to
# SchedulerMetricsCollector.update_forecast_cache_hit_rate; this module does
# not derive it from the hit/miss counters above and does not range-check it.
forecast_cache_hit_rate = Gauge(
    'forecast_cache_hit_rate',
    'Forecast cache hit rate percentage (0-100)',
    ['tenant_id']
)

# Current number of cache entries by cache type. Despite the _total suffix
# this is a Gauge, not a Counter. No SchedulerMetricsCollector helper updates
# it.
forecast_cache_entries_total = Gauge(
    'forecast_cache_entries_total',
    'Total number of entries in forecast cache',
    ['cache_type']  # cache_type: single, batch
)

# Cache invalidation events, per tenant and reason. No
# SchedulerMetricsCollector helper updates this counter.
forecast_cache_invalidations_total = Counter(
    'forecast_cache_invalidations_total',
    'Total number of forecast cache invalidations',
    ['tenant_id', 'reason']  # reason: model_retrain, manual, expiry
)

# ================================================================
# GENERAL SCHEDULER HEALTH METRICS
# ================================================================

# Health flag per (service, scheduler_type). Set to 1 or 0 by
# SchedulerMetricsCollector.update_scheduler_health.
# NOTE(review): the service values listed on the label comment below
# ('production, orders') differ from those listed for
# tenant_processing_timeout_total ('production, procurement') - confirm which
# set callers actually use.
scheduler_health_status = Gauge(
    'scheduler_health_status',
    'Scheduler health status (1=healthy, 0=unhealthy)',
    ['service', 'scheduler_type']  # service: production, orders; scheduler_type: daily, weekly, cleanup
)

# Unix timestamp of the most recent run per (service, scheduler_type). No
# SchedulerMetricsCollector helper sets this gauge.
scheduler_last_run_timestamp = Gauge(
    'scheduler_last_run_timestamp',
    'Unix timestamp of last scheduler run',
    ['service', 'scheduler_type']
)

# Unix timestamp of the next planned run per (service, scheduler_type). No
# SchedulerMetricsCollector helper sets this gauge.
scheduler_next_run_timestamp = Gauge(
    'scheduler_next_run_timestamp',
    'Unix timestamp of next scheduled run',
    ['service', 'scheduler_type']
)

# Per-tenant processing timeouts, split by service. Incremented by
# SchedulerMetricsCollector.record_timeout.
tenant_processing_timeout_total = Counter(
    'tenant_processing_timeout_total',
    'Total number of tenant processing timeouts',
    ['service', 'tenant_id']  # service: production, procurement
)

# ================================================================
# HELPER FUNCTIONS FOR METRICS
# ================================================================


class SchedulerMetricsCollector:
    """Helper class for collecting scheduler metrics.

    Every method is a ``@staticmethod`` that selects a label set on one of the
    module-level Prometheus metrics and records a single event or value on it.
    The class keeps no state of its own, so methods may be called on the class
    or on any instance.
    """

    @staticmethod
    def record_production_schedule_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        batches_created: int,
    ) -> None:
        """Record the outcome of one production schedule generation.

        The attempt counter and the duration histogram are updated for every
        call; the batch counter is only increased when ``success`` is true.

        Args:
            tenant_id: Tenant the schedule was generated for (label value).
            success: Whether generation succeeded; mapped to the ``status``
                label as ``'success'`` or ``'failure'``.
            duration_seconds: Time the generation took, in seconds.
            batches_created: Number of batches to add to the per-tenant batch
                counter. Ignored when ``success`` is false.

        Raises:
            ValueError: Propagated from ``prometheus_client`` if
                ``batches_created`` is negative (counters cannot decrease).
        """
        status = 'success' if success else 'failure'
        production_schedules_generated_total.labels(tenant_id=tenant_id, status=status).inc()
        production_schedule_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if success:
            production_batches_created_total.labels(tenant_id=tenant_id).inc(batches_created)

    @staticmethod
    def record_procurement_plan_generated(
        tenant_id: str,
        success: bool,
        duration_seconds: float,
        requirements_count: int,
        priority: str = 'medium',
    ) -> None:
        """Record the outcome of one procurement plan generation.

        The attempt counter and the duration histogram are updated for every
        call; the requirements counter is only increased when ``success`` is
        true.

        Args:
            tenant_id: Tenant the plan was generated for (label value).
            success: Whether generation succeeded; mapped to the ``status``
                label as ``'success'`` or ``'failure'``.
            duration_seconds: Time the generation took, in seconds.
            requirements_count: Number of requirements to add to the
                per-tenant requirements counter. Ignored when ``success`` is
                false.
            priority: Value for the ``priority`` label under which
                ``requirements_count`` is recorded. Defaults to ``'medium'``,
                which is what this method always used before the parameter
                existed. Callers that know the real breakdown can call this
                once for the plan and then increment
                ``procurement_requirements_created_total`` per priority, or
                pass the dominant priority here.

        Raises:
            ValueError: Propagated from ``prometheus_client`` if
                ``requirements_count`` is negative (counters cannot decrease).
        """
        status = 'success' if success else 'failure'
        procurement_plans_generated_total.labels(tenant_id=tenant_id, status=status).inc()
        procurement_plan_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds)

        if success:
            procurement_requirements_created_total.labels(
                tenant_id=tenant_id,
                priority=priority,
            ).inc(requirements_count)

    @staticmethod
    def record_scheduler_run(service: str, trigger: str = 'scheduled') -> None:
        """Record one scheduler execution.

        Args:
            service: ``'production'`` or ``'procurement'``; selects which
                runs counter is incremented.
            trigger: Value for the ``trigger`` label.

        Any other ``service`` value records nothing; a warning is logged so
        that a misspelled service name does not silently lose metrics.
        """
        if service == 'production':
            production_scheduler_runs_total.labels(trigger=trigger).inc()
        elif service == 'procurement':
            procurement_scheduler_runs_total.labels(trigger=trigger).inc()
        else:
            # Never raise from metrics code: report the bad input and move on.
            logger.warning(
                "Unknown scheduler service, scheduler run not recorded",
                service=service,
                trigger=trigger,
            )

    @staticmethod
    def record_tenant_processing(service: str, status: str) -> None:
        """Record the result of processing one tenant.

        Args:
            service: ``'production'`` or ``'procurement'``; selects which
                tenants-processed counter is incremented.
            status: Value for the ``status`` label, passed through unchanged.

        Any other ``service`` value records nothing; a warning is logged so
        that a misspelled service name does not silently lose metrics.
        """
        if service == 'production':
            production_tenants_processed_total.labels(status=status).inc()
        elif service == 'procurement':
            procurement_tenants_processed_total.labels(status=status).inc()
        else:
            # Never raise from metrics code: report the bad input and move on.
            logger.warning(
                "Unknown scheduler service, tenant processing not recorded",
                service=service,
                status=status,
            )

    @staticmethod
    def record_forecast_cache_lookup(tenant_id: str, hit: bool) -> None:
        """Record one forecast cache lookup as a hit or a miss.

        Args:
            tenant_id: Tenant the lookup was made for (label value).
            hit: True increments the hits counter, false the misses counter.
        """
        if hit:
            forecast_cache_hits_total.labels(tenant_id=tenant_id).inc()
        else:
            forecast_cache_misses_total.labels(tenant_id=tenant_id).inc()

    @staticmethod
    def update_forecast_cache_hit_rate(tenant_id: str, hit_rate_percent: float) -> None:
        """Set the forecast cache hit-rate gauge for a tenant.

        Args:
            tenant_id: Tenant the rate applies to (label value).
            hit_rate_percent: Value stored in the gauge as given; it is not
                computed from the hit/miss counters and not range-checked.
        """
        forecast_cache_hit_rate.labels(tenant_id=tenant_id).set(hit_rate_percent)

    @staticmethod
    def record_plan_rejection(tenant_id: str, auto_regenerated: bool) -> None:
        """Record one procurement plan rejection.

        Args:
            tenant_id: Tenant whose plan was rejected (label value).
            auto_regenerated: Written to the ``auto_regenerated`` label as the
                string ``'true'`` or ``'false'``.
        """
        procurement_plan_rejections_total.labels(
            tenant_id=tenant_id,
            auto_regenerated='true' if auto_regenerated else 'false'
        ).inc()

    @staticmethod
    def update_scheduler_health(service: str, scheduler_type: str, is_healthy: bool) -> None:
        """Set the health gauge for one scheduler.

        Args:
            service: Value for the ``service`` label, passed through unchanged.
            scheduler_type: Value for the ``scheduler_type`` label.
            is_healthy: Stored as 1 when true and 0 when false.
        """
        scheduler_health_status.labels(
            service=service,
            scheduler_type=scheduler_type
        ).set(1 if is_healthy else 0)

    @staticmethod
    def record_timeout(service: str, tenant_id: str) -> None:
        """Record one tenant processing timeout.

        Args:
            service: Value for the ``service`` label, passed through unchanged.
            tenant_id: Tenant whose processing timed out (label value).
        """
        tenant_processing_timeout_total.labels(
            service=service,
            tenant_id=tenant_id
        ).inc()


# Global metrics collector instance, created once when this module is
# imported. SchedulerMetricsCollector exposes only static methods, so this
# object is just a convenient shared handle.
metrics_collector = SchedulerMetricsCollector()


def get_scheduler_metrics_collector() -> SchedulerMetricsCollector:
    """Get global scheduler metrics collector.

    Returns:
        The module-level ``metrics_collector`` instance; every call returns
        the same object.
    """
    return metrics_collector