From e8fda39e500b3011e8f87020c21e39f06b3cf910 Mon Sep 17 00:00:00 2001 From: Urtzi Alfaro Date: Thu, 8 Jan 2026 20:48:24 +0100 Subject: [PATCH] Improve metrics --- DOCKERHUB_QUICKSTART.md | 134 ----- docs/DEV-HTTPS-SETUP.md | 337 ----------- docs/DOCKERHUB_SETUP.md | 337 ----------- docs/MONITORING_COMPLETE_GUIDE.md | 449 --------------- docs/MONITORING_DOCUMENTATION.md | 536 ++++++++++++++++++ docs/MONITORING_QUICKSTART.md | 283 --------- docs/MONITORING_SETUP.md | 511 ----------------- gateway/app/main.py | 17 +- gateway/requirements.txt | 3 + .../kubernetes/setup-database-monitoring.sh | 133 ----- services/ai_insights/app/main.py | 19 +- services/alert_processor/app/main.py | 19 +- services/demo_session/app/main.py | 19 +- .../demo_session/app/monitoring/metrics.py | 85 --- .../app/services/cleanup_service.py | 5 - .../app/services/clone_orchestrator.py | 11 - services/notification/app/main.py | 19 +- services/tenant/app/main.py | 19 +- services/training/app/main.py | 20 +- shared/monitoring/alert_metrics.py | 420 -------------- shared/monitoring/scheduler_metrics.py | 258 --------- 21 files changed, 615 insertions(+), 3019 deletions(-) delete mode 100644 DOCKERHUB_QUICKSTART.md delete mode 100644 docs/DEV-HTTPS-SETUP.md delete mode 100644 docs/DOCKERHUB_SETUP.md delete mode 100644 docs/MONITORING_COMPLETE_GUIDE.md create mode 100644 docs/MONITORING_DOCUMENTATION.md delete mode 100644 docs/MONITORING_QUICKSTART.md delete mode 100644 docs/MONITORING_SETUP.md delete mode 100755 infrastructure/kubernetes/setup-database-monitoring.sh delete mode 100644 services/demo_session/app/monitoring/metrics.py delete mode 100755 shared/monitoring/alert_metrics.py delete mode 100755 shared/monitoring/scheduler_metrics.py diff --git a/DOCKERHUB_QUICKSTART.md b/DOCKERHUB_QUICKSTART.md deleted file mode 100644 index b31b8ae6..00000000 --- a/DOCKERHUB_QUICKSTART.md +++ /dev/null @@ -1,134 +0,0 @@ -# Docker Hub Quick Start Guide - -## πŸš€ Quick Setup (3 Steps) - -### 1. 
Create Docker Hub Secrets - -```bash -./infrastructure/kubernetes/setup-dockerhub-secrets.sh -``` - -This creates the `dockerhub-creds` secret in all namespaces with your Docker Hub credentials. - -### 2. Apply Updated Manifests - -```bash -# Development environment -kubectl apply -k infrastructure/kubernetes/overlays/dev - -# Production environment -kubectl apply -k infrastructure/kubernetes/overlays/prod -``` - -### 3. Verify Pods Are Running - -```bash -kubectl get pods -n bakery-ia -``` - -All pods should now be able to pull images from Docker Hub! - ---- - -## πŸ”§ What Was Configured - -βœ… **Docker Hub Credentials** -- Username: `uals` -- Access Token: `dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A` -- Email: `ualfaro@gmail.com` - -βœ… **Kubernetes Secrets** -- Created in: `bakery-ia`, `bakery-ia-dev`, `bakery-ia-prod`, `default` -- Secret name: `dockerhub-creds` - -βœ… **Manifests Updated (47 files)** -- All service deployments -- All database deployments -- All migration jobs -- All cronjobs and standalone jobs - -βœ… **Tiltfile Configuration** -- Supports both local registry and Docker Hub -- Use `export USE_DOCKERHUB=true` to enable Docker Hub mode - ---- - -## πŸ“– Full Documentation - -See [docs/DOCKERHUB_SETUP.md](docs/DOCKERHUB_SETUP.md) for: -- Detailed configuration steps -- Troubleshooting guide -- Security best practices -- Image management -- Rate limits information - ---- - -## πŸ”„ Using with Tilt (Local Development) - -**Default: Local Registry** -```bash -tilt up -``` - -**Docker Hub Mode** -```bash -export USE_DOCKERHUB=true -export DOCKERHUB_USERNAME=uals -docker login -u uals -tilt up -``` - ---- - -## 🐳 Pushing Images to Docker Hub - -```bash -# Login first -docker login -u uals - -# Use the automated script -./scripts/tag-and-push-images.sh -``` - ---- - -## ⚠️ Troubleshooting - -**Problem: ImagePullBackOff** -```bash -# Check if secret exists -kubectl get secret dockerhub-creds -n bakery-ia - -# Recreate secret if needed 
-./infrastructure/kubernetes/setup-dockerhub-secrets.sh -``` - -**Problem: Pods not using new credentials** -```bash -# Restart deployment -kubectl rollout restart deployment/ -n bakery-ia -``` - ---- - -## πŸ“ Scripts Reference - -| Script | Purpose | -|--------|---------| -| `infrastructure/kubernetes/setup-dockerhub-secrets.sh` | Create Docker Hub secrets in all namespaces | -| `infrastructure/kubernetes/add-image-pull-secrets.sh` | Add imagePullSecrets to manifests (already done) | -| `scripts/tag-and-push-images.sh` | Tag and push all custom images to Docker Hub | - ---- - -## βœ… Verification Checklist - -- [ ] Docker Hub secret created: `kubectl get secret dockerhub-creds -n bakery-ia` -- [ ] Manifests applied: `kubectl apply -k infrastructure/kubernetes/overlays/dev` -- [ ] Pods running: `kubectl get pods -n bakery-ia` -- [ ] No ImagePullBackOff errors: `kubectl get events -n bakery-ia` - ---- - -**Need help?** See the full documentation at [docs/DOCKERHUB_SETUP.md](docs/DOCKERHUB_SETUP.md) diff --git a/docs/DEV-HTTPS-SETUP.md b/docs/DEV-HTTPS-SETUP.md deleted file mode 100644 index cb25d69c..00000000 --- a/docs/DEV-HTTPS-SETUP.md +++ /dev/null @@ -1,337 +0,0 @@ -# HTTPS in Development Environment - -## Overview - -Development environment now uses HTTPS by default to match production behavior and catch SSL-related issues early. - -**Benefits:** -- βœ… Matches production HTTPS behavior -- βœ… Tests SSL/TLS configurations -- βœ… Catches mixed content warnings -- βœ… Tests secure cookie handling -- βœ… Better dev-prod parity - ---- - -## Quick Start - -### 1. Deploy with HTTPS Enabled - -```bash -# Start development environment -skaffold dev --profile=dev - -# Wait for certificate to be issued -kubectl get certificate -n bakery-ia - -# You should see: -# NAME READY SECRET AGE -# bakery-dev-tls-cert True bakery-dev-tls-cert 1m -``` - -### 2. 
Access Your Application - -```bash -# Access via HTTPS (will show certificate warning in browser) -open https://localhost - -# Or via curl (use -k to skip certificate verification) -curl -k https://localhost/api/health -``` - ---- - -## Trust the Self-Signed Certificate - -To avoid browser certificate warnings, you need to trust the self-signed certificate. - -### Option 1: Accept Browser Warning (Quick & Easy) - -When you visit `https://localhost`: -1. Browser shows "Your connection is not private" or similar -2. Click "Advanced" or "Show details" -3. Click "Proceed to localhost" or "Accept the risk" -4. Certificate warning will appear on first visit only per browser session - -### Option 2: Trust Certificate in System (Recommended) - -#### On macOS: - -```bash -# 1. Export the certificate from Kubernetes -kubectl get secret bakery-dev-tls-cert -n bakery-ia -o jsonpath='{.data.tls\.crt}' | base64 -d > /tmp/bakery-dev-cert.crt - -# 2. Add to Keychain -sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain /tmp/bakery-dev-cert.crt - -# 3. Verify -security find-certificate -c localhost -a - -# 4. Cleanup -rm /tmp/bakery-dev-cert.crt -``` - -**Alternative (GUI):** -1. Export certificate: `kubectl get secret bakery-dev-tls-cert -n bakery-ia -o jsonpath='{.data.tls\.crt}' | base64 -d > bakery-dev-cert.crt` -2. Double-click the `.crt` file to open Keychain Access -3. Find "localhost" certificate -4. Double-click β†’ Trust β†’ "Always Trust" -5. Close and enter your password - -#### On Linux: - -```bash -# 1. Export the certificate -kubectl get secret bakery-dev-tls-cert -n bakery-ia -o jsonpath='{.data.tls\.crt}' | base64 -d | sudo tee /usr/local/share/ca-certificates/bakery-dev.crt - -# 2. Update CA certificates -sudo update-ca-certificates - -# 3. 
For browsers (Chromium/Chrome) -mkdir -p $HOME/.pki/nssdb -certutil -d sql:$HOME/.pki/nssdb -A -t "P,," -n "Bakery Dev" -i /usr/local/share/ca-certificates/bakery-dev.crt -``` - -#### On Windows: - -```powershell -# 1. Export the certificate -kubectl get secret bakery-dev-tls-cert -n bakery-ia -o jsonpath='{.data.tls.crt}' | Out-File -Encoding ASCII bakery-dev-cert.crt - -# 2. Import to Trusted Root -Import-Certificate -FilePath .\bakery-dev-cert.crt -CertStoreLocation Cert:\LocalMachine\Root - -# Or use GUI: -# - Double-click bakery-dev-cert.crt -# - Install Certificate -# - Store Location: Local Machine -# - Place in: Trusted Root Certification Authorities -``` - ---- - -## Testing HTTPS - -### Test with curl - -```bash -# Without certificate verification (quick test) -curl -k https://localhost/api/health - -# With certificate verification (after trusting cert) -curl https://localhost/api/health - -# Check certificate details -curl -vI https://localhost/api/health 2>&1 | grep -A 10 "Server certificate" - -# Test CORS with HTTPS -curl -H "Origin: https://localhost:3000" \ - -H "Access-Control-Request-Method: POST" \ - -X OPTIONS https://localhost/api/health -``` - -### Test with Browser - -1. Open `https://localhost` -2. Check for SSL/TLS padlock in address bar -3. Click padlock β†’ View certificate -4. 
Verify: - - Issued to: localhost - - Issued by: localhost (self-signed) - - Valid for: 90 days - -### Test Frontend - -```bash -# Update your frontend .env to use HTTPS -echo "VITE_API_URL=https://localhost/api" > frontend/.env.local - -# Frontend should now make HTTPS requests -``` - ---- - -## Certificate Details - -### Certificate Specifications - -- **Type**: Self-signed (for development) -- **Algorithm**: RSA 2048-bit -- **Validity**: 90 days (auto-renews 15 days before expiration) -- **Common Name**: localhost -- **DNS Names**: - - localhost - - bakery-ia.local - - api.bakery-ia.local - - *.bakery-ia.local -- **IP Addresses**: 127.0.0.1, ::1 - -### Certificate Issuer - -- **Issuer**: `selfsigned-issuer` (cert-manager ClusterIssuer) -- **Auto-renewal**: Managed by cert-manager -- **Secret Name**: `bakery-dev-tls-cert` - ---- - -## Troubleshooting - -### Certificate Not Issued - -```bash -# Check certificate status -kubectl describe certificate bakery-dev-tls-cert -n bakery-ia - -# Check cert-manager logs -kubectl logs -n cert-manager deployment/cert-manager - -# Check if cert-manager is installed -kubectl get pods -n cert-manager - -# If cert-manager is not installed: -kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml -``` - -### Certificate Warning in Browser - -**Normal for self-signed certificates!** Choose one: -1. Click "Proceed" (quick, temporary) -2. 
Trust the certificate in your system (permanent) - -### Mixed Content Warnings - -If you see "mixed content" errors: -- Ensure all API calls use HTTPS -- Check for hardcoded HTTP URLs -- Update `VITE_API_URL` to use HTTPS - -### Certificate Expired - -```bash -# Check expiration -kubectl get certificate bakery-dev-tls-cert -n bakery-ia -o jsonpath='{.status.notAfter}' - -# Force renewal -kubectl delete certificate bakery-dev-tls-cert -n bakery-ia -kubectl apply -k infrastructure/kubernetes/overlays/dev - -# cert-manager will automatically recreate it -``` - -### Browser Shows "NET::ERR_CERT_AUTHORITY_INVALID" - -This is expected for self-signed certificates. Options: -1. Click "Advanced" β†’ "Proceed to localhost" -2. Trust the certificate (see instructions above) -3. Use curl with `-k` flag for testing - ---- - -## Disable HTTPS (Not Recommended) - -If you need to temporarily disable HTTPS: - -```bash -# Edit dev-ingress.yaml -vim infrastructure/kubernetes/overlays/dev/dev-ingress.yaml - -# Change: -# nginx.ingress.kubernetes.io/ssl-redirect: "true" β†’ "false" -# nginx.ingress.kubernetes.io/force-ssl-redirect: "true" β†’ "false" - -# Comment out the tls section: -# tls: -# - hosts: -# - localhost -# secretName: bakery-dev-tls-cert - -# Redeploy -skaffold dev --profile=dev -``` - ---- - -## Differences from Production - -| Aspect | Development | Production | -|--------|-------------|------------| -| Certificate Type | Self-signed | Let's Encrypt | -| Validity | 90 days | 90 days | -| Auto-renewal | cert-manager | cert-manager | -| Trust | Manual trust needed | Automatically trusted | -| Domains | localhost | Real domains | -| Browser Warning | Yes (self-signed) | No (CA-signed) | - ---- - -## FAQ - -### Q: Why am I seeing certificate warnings? -**A:** Self-signed certificates aren't trusted by browsers by default. Trust the certificate or click "Proceed." - -### Q: Do I need to trust the certificate? -**A:** No, but it makes development easier. 
You can click "Proceed" on each browser session. - -### Q: Will this affect my frontend development? -**A:** Slightly. Update `VITE_API_URL` to use `https://`. Otherwise works the same. - -### Q: Can I use HTTP instead? -**A:** Yes, but not recommended. It reduces dev-prod parity and won't catch HTTPS issues. - -### Q: How often do I need to re-trust the certificate? -**A:** Only when the certificate is recreated (every 90 days or when you delete the cluster). - -### Q: Does this work with bakery-ia.local? -**A:** Yes! The certificate is valid for both `localhost` and `bakery-ia.local`. - ---- - -## Additional Security Testing - -With HTTPS enabled, you can now test: - -### 1. Secure Cookies -```javascript -// In your frontend -document.cookie = "session=test; Secure; SameSite=Strict"; -``` - -### 2. Mixed Content Detection -```javascript -// This will show warning in dev (good - catches prod issues!) -fetch('http://api.example.com/data') // ❌ Mixed content -fetch('https://api.example.com/data') // βœ… Secure -``` - -### 3. HSTS (HTTP Strict Transport Security) -```bash -# Check HSTS headers -curl -I https://localhost/api/health | grep -i strict -``` - -### 4. TLS Version Testing -```bash -# Test TLS 1.2 -curl --tlsv1.2 https://localhost/api/health - -# Test TLS 1.3 -curl --tlsv1.3 https://localhost/api/health -``` - ---- - -## Summary - -βœ… **Enabled**: HTTPS in development by default -βœ… **Certificate**: Self-signed, auto-renewed -βœ… **Access**: `https://localhost` -βœ… **Trust**: Optional but recommended -βœ… **Benefit**: Better dev-prod parity - -**Next Steps:** -1. Deploy: `skaffold dev --profile=dev` -2. Access: `https://localhost` -3. Trust: Follow instructions above (optional) -4. Test: Verify HTTPS works - -For issues, see Troubleshooting section or check cert-manager logs. 
diff --git a/docs/DOCKERHUB_SETUP.md b/docs/DOCKERHUB_SETUP.md deleted file mode 100644 index 5140518b..00000000 --- a/docs/DOCKERHUB_SETUP.md +++ /dev/null @@ -1,337 +0,0 @@ -# Docker Hub Configuration Guide - -This guide explains how to configure Docker Hub for all image pulls in the Bakery IA project. - -## Overview - -The project has been configured to use Docker Hub credentials for pulling both: -- **Base images** (postgres, redis, python, node, nginx, etc.) -- **Custom bakery images** (bakery/auth-service, bakery/gateway, etc.) - -## Quick Start - -### 1. Create Docker Hub Secret in Kubernetes - -Run the automated setup script: - -```bash -./infrastructure/kubernetes/setup-dockerhub-secrets.sh -``` - -This script will: -- Create the `dockerhub-creds` secret in all namespaces (bakery-ia, bakery-ia-dev, bakery-ia-prod, default) -- Use the credentials: `uals` / `dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A` - -### 2. Apply Updated Kubernetes Manifests - -All manifests have been updated with `imagePullSecrets`. Apply them: - -```bash -# For development -kubectl apply -k infrastructure/kubernetes/overlays/dev - -# For production -kubectl apply -k infrastructure/kubernetes/overlays/prod -``` - -### 3. 
Verify Pods Can Pull Images - -```bash -# Check pod status -kubectl get pods -n bakery-ia - -# Check events for image pull status -kubectl get events -n bakery-ia --sort-by='.lastTimestamp' - -# Describe a specific pod to see image pull details -kubectl describe pod -n bakery-ia -``` - -## Manual Setup - -If you prefer to create the secret manually: - -```bash -kubectl create secret docker-registry dockerhub-creds \ - --docker-server=docker.io \ - --docker-username=uals \ - --docker-password=dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A \ - --docker-email=ualfaro@gmail.com \ - -n bakery-ia -``` - -Repeat for other namespaces: -```bash -kubectl create secret docker-registry dockerhub-creds \ - --docker-server=docker.io \ - --docker-username=uals \ - --docker-password=dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A \ - --docker-email=ualfaro@gmail.com \ - -n bakery-ia-dev - -kubectl create secret docker-registry dockerhub-creds \ - --docker-server=docker.io \ - --docker-username=uals \ - --docker-password=dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A \ - --docker-email=ualfaro@gmail.com \ - -n bakery-ia-prod -``` - -## What Was Changed - -### 1. Kubernetes Manifests (47 files updated) - -All deployments, jobs, and cronjobs now include `imagePullSecrets`: - -```yaml -spec: - template: - spec: - imagePullSecrets: - - name: dockerhub-creds - containers: - - name: ... -``` - -**Files Updated:** -- **19 Service Deployments**: All microservices (auth, tenant, forecasting, etc.) -- **21 Database Deployments**: All PostgreSQL instances, Redis, RabbitMQ -- **21 Migration Jobs**: All database migration jobs -- **2 CronJobs**: demo-cleanup, external-data-rotation -- **2 Standalone Jobs**: external-data-init, nominatim-init -- **1 Worker Deployment**: demo-cleanup-worker - -### 2. 
Tiltfile Configuration - -The Tiltfile now supports both local registry and Docker Hub: - -**Default (Local Registry):** -```bash -tilt up -``` - -**Docker Hub Mode:** -```bash -export USE_DOCKERHUB=true -export DOCKERHUB_USERNAME=uals -tilt up -``` - -### 3. Scripts - -Two new scripts were created: - -1. **[setup-dockerhub-secrets.sh](../infrastructure/kubernetes/setup-dockerhub-secrets.sh)** - - Creates Docker Hub secrets in all namespaces - - Idempotent (safe to run multiple times) - -2. **[add-image-pull-secrets.sh](../infrastructure/kubernetes/add-image-pull-secrets.sh)** - - Adds `imagePullSecrets` to all Kubernetes manifests - - Already run (no need to run again unless adding new manifests) - -## Using Docker Hub with Tilt - -To use Docker Hub for development with Tilt: - -```bash -# Login to Docker Hub first -docker login -u uals - -# Enable Docker Hub mode -export USE_DOCKERHUB=true -export DOCKERHUB_USERNAME=uals - -# Start Tilt -tilt up -``` - -This will: -- Build images locally -- Tag them as `docker.io/uals/` -- Push them to Docker Hub -- Deploy to Kubernetes with imagePullSecrets - -## Images Configuration - -### Base Images (from Docker Hub) - -These images are pulled from Docker Hub's public registry: - -- `python:3.11-slim` - Python base for all microservices -- `node:18-alpine` - Node.js for frontend builder -- `nginx:1.25-alpine` - Nginx for frontend production -- `postgres:17-alpine` - PostgreSQL databases -- `redis:7.4-alpine` - Redis cache -- `rabbitmq:4.1-management-alpine` - RabbitMQ message broker -- `busybox:latest` - Utility container -- `curlimages/curl:latest` - Curl utility -- `mediagis/nominatim:4.4` - Geolocation service - -### Custom Images (bakery/*) - -These images are built by the project: - -**Infrastructure:** -- `bakery/gateway` -- `bakery/dashboard` - -**Core Services:** -- `bakery/auth-service` -- `bakery/tenant-service` - -**Data & Analytics:** -- `bakery/training-service` -- `bakery/forecasting-service` -- 
`bakery/ai-insights-service` - -**Operations:** -- `bakery/sales-service` -- `bakery/inventory-service` -- `bakery/production-service` -- `bakery/procurement-service` -- `bakery/distribution-service` - -**Supporting:** -- `bakery/recipes-service` -- `bakery/suppliers-service` -- `bakery/pos-service` -- `bakery/orders-service` -- `bakery/external-service` - -**Platform:** -- `bakery/notification-service` -- `bakery/alert-processor` -- `bakery/orchestrator-service` - -**Demo:** -- `bakery/demo-session-service` - -## Pushing Custom Images to Docker Hub - -Use the existing tag-and-push script: - -```bash -# Login first -docker login -u uals - -# Tag and push all images -./scripts/tag-and-push-images.sh -``` - -Or manually for a specific image: - -```bash -# Build -docker build -t bakery/auth-service:latest -f services/auth/Dockerfile . - -# Tag for Docker Hub -docker tag bakery/auth-service:latest uals/bakery-auth-service:latest - -# Push -docker push uals/bakery-auth-service:latest -``` - -## Troubleshooting - -### Problem: ImagePullBackOff error - -Check if the secret exists: -```bash -kubectl get secret dockerhub-creds -n bakery-ia -``` - -Verify secret is correctly configured: -```bash -kubectl get secret dockerhub-creds -n bakery-ia -o yaml -``` - -Check pod events: -```bash -kubectl describe pod -n bakery-ia -``` - -### Problem: Authentication failure - -The Docker Hub credentials might be incorrect or expired. Update the secret: - -```bash -# Delete old secret -kubectl delete secret dockerhub-creds -n bakery-ia - -# Create new secret with updated credentials -kubectl create secret docker-registry dockerhub-creds \ - --docker-server=docker.io \ - --docker-username= \ - --docker-password= \ - --docker-email= \ - -n bakery-ia -``` - -### Problem: Pod still using old credentials - -Restart the pod to pick up the new secret: - -```bash -kubectl rollout restart deployment/ -n bakery-ia -``` - -## Security Best Practices - -1. 
**Use Docker Hub Access Tokens** (not passwords) - - Create at: https://hub.docker.com/settings/security - - Set appropriate permissions (Read-only for pulls) - -2. **Rotate Credentials Regularly** - - Update the secret every 90 days - - Use the setup script for consistent updates - -3. **Limit Secret Access** - - Only grant access to necessary namespaces - - Use RBAC to control who can read secrets - -4. **Monitor Usage** - - Check Docker Hub pull rate limits - - Monitor for unauthorized access - -## Rate Limits - -Docker Hub has rate limits for image pulls: - -- **Anonymous users**: 100 pulls per 6 hours per IP -- **Authenticated users**: 200 pulls per 6 hours -- **Pro/Team**: Unlimited - -Using authentication (imagePullSecrets) ensures you get the authenticated user rate limit. - -## Environment Variables - -For CI/CD or automated deployments, use these environment variables: - -```bash -export DOCKER_USERNAME=uals -export DOCKER_PASSWORD=dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A -export DOCKER_EMAIL=ualfaro@gmail.com -``` - -## Next Steps - -1. βœ… Docker Hub secret created in all namespaces -2. βœ… All Kubernetes manifests updated with imagePullSecrets -3. βœ… Tiltfile configured for optional Docker Hub usage -4. πŸ”„ Apply manifests to your cluster -5. πŸ”„ Verify pods can pull images successfully - -## Related Documentation - -- [Kubernetes Setup Guide](./KUBERNETES_SETUP.md) -- [Security Implementation](./SECURITY_IMPLEMENTATION_COMPLETE.md) -- [Tilt Development Workflow](../Tiltfile) - -## Support - -If you encounter issues: - -1. Check the troubleshooting section above -2. Verify Docker Hub credentials at: https://hub.docker.com/settings/security -3. Check Kubernetes events: `kubectl get events -A --sort-by='.lastTimestamp'` -4. 
Review pod logs: `kubectl logs -n bakery-ia ` diff --git a/docs/MONITORING_COMPLETE_GUIDE.md b/docs/MONITORING_COMPLETE_GUIDE.md deleted file mode 100644 index 84fc54f9..00000000 --- a/docs/MONITORING_COMPLETE_GUIDE.md +++ /dev/null @@ -1,449 +0,0 @@ -# Complete Monitoring Guide - Bakery IA Platform - -This guide provides the complete overview of observability implementation for the Bakery IA platform using SigNoz and OpenTelemetry. - -## 🎯 Executive Summary - -**What's Implemented:** -- βœ… **Distributed Tracing** - All 17 services -- βœ… **Application Metrics** - HTTP requests, latencies, errors -- βœ… **System Metrics** - CPU, memory, disk, network per service -- βœ… **Structured Logs** - With trace correlation -- βœ… **Database Monitoring** - PostgreSQL, Redis, RabbitMQ metrics -- βœ… **Pure OpenTelemetry** - No Prometheus, all OTLP push - -**Technology Stack:** -- **Backend**: OpenTelemetry Python SDK -- **Collector**: OpenTelemetry Collector (OTLP receivers) -- **Storage**: ClickHouse (traces, metrics, logs) -- **Frontend**: SigNoz UI -- **Protocol**: OTLP over HTTP/gRPC - -## πŸ“Š Architecture - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Application Services β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ auth β”‚ β”‚ inv β”‚ β”‚ orders β”‚ β”‚ ... 
β”‚ β”‚ -β”‚ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ -β”‚ Traces + Metrics + Logs β”‚ -β”‚ (OpenTelemetry OTLP) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Database Monitoring Collector β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ PG β”‚ β”‚ Redis β”‚ β”‚RabbitMQβ”‚ β”‚ -β”‚ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”¬β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ -β”‚ Database Metrics β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ SigNoz OpenTelemetry Collector β”‚ -β”‚ β”‚ -β”‚ Receivers: OTLP (gRPC :4317, HTTP :4318) β”‚ -β”‚ Processors: batch, memory_limiter, resourcedetection β”‚ -β”‚ Exporters: ClickHouse β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό 
-β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ ClickHouse Database β”‚ -β”‚ β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Traces β”‚ β”‚ Metrics β”‚ β”‚ Logs β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ SigNoz Frontend UI β”‚ -β”‚ https://monitoring.bakery-ia.local β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -## πŸš€ Quick Start - -### 1. Deploy SigNoz - -```bash -# Add Helm repository -helm repo add signoz https://charts.signoz.io -helm repo update - -# Create namespace and install -kubectl create namespace signoz -helm install signoz signoz/signoz \ - -n signoz \ - -f infrastructure/helm/signoz-values-dev.yaml - -# Wait for pods -kubectl wait --for=condition=ready pod -l app=signoz -n signoz --timeout=300s -``` - -### 2. Deploy Services with Monitoring - -All services are already configured with OpenTelemetry environment variables. - -```bash -# Apply all services -kubectl apply -k infrastructure/kubernetes/overlays/dev/ - -# Or restart existing services -kubectl rollout restart deployment -n bakery-ia -``` - -### 3. 
Deploy Database Monitoring - -```bash -# Run the setup script -./infrastructure/kubernetes/setup-database-monitoring.sh - -# This will: -# - Create monitoring users in PostgreSQL -# - Deploy OpenTelemetry collector for database metrics -# - Start collecting PostgreSQL, Redis, RabbitMQ metrics -``` - -### 4. Access SigNoz UI - -```bash -# Via ingress -open https://monitoring.bakery-ia.local - -# Or port-forward -kubectl port-forward -n signoz svc/signoz-frontend 3301:3301 -open http://localhost:3301 -``` - -## πŸ“ˆ Metrics Collected - -### Application Metrics (Per Service) - -| Metric | Description | Type | -|--------|-------------|------| -| `http_requests_total` | Total HTTP requests | Counter | -| `http_request_duration_seconds` | Request latency | Histogram | -| `active_requests` | Current active requests | Gauge | - -### System Metrics (Per Service) - -| Metric | Description | Type | -|--------|-------------|------| -| `process.cpu.utilization` | Process CPU % | Gauge | -| `process.memory.usage` | Process memory bytes | Gauge | -| `process.memory.utilization` | Process memory % | Gauge | -| `process.threads.count` | Thread count | Gauge | -| `process.open_file_descriptors` | Open FDs (Unix) | Gauge | -| `system.cpu.utilization` | System CPU % | Gauge | -| `system.memory.usage` | System memory | Gauge | -| `system.memory.utilization` | System memory % | Gauge | -| `system.disk.io.read` | Disk read bytes | Counter | -| `system.disk.io.write` | Disk write bytes | Counter | -| `system.network.io.sent` | Network sent bytes | Counter | -| `system.network.io.received` | Network recv bytes | Counter | - -### PostgreSQL Metrics - -| Metric | Description | -|--------|-------------| -| `postgresql.backends` | Active connections | -| `postgresql.database.size` | Database size in bytes | -| `postgresql.commits` | Transaction commits | -| `postgresql.rollbacks` | Transaction rollbacks | -| `postgresql.deadlocks` | Deadlock count | -| `postgresql.blocks_read` | Blocks read 
from disk | -| `postgresql.table.size` | Table size | -| `postgresql.index.size` | Index size | - -### Redis Metrics - -| Metric | Description | -|--------|-------------| -| `redis.clients.connected` | Connected clients | -| `redis.commands.processed` | Commands processed | -| `redis.keyspace.hits` | Cache hits | -| `redis.keyspace.misses` | Cache misses | -| `redis.memory.used` | Memory usage | -| `redis.memory.fragmentation_ratio` | Fragmentation | -| `redis.db.keys` | Number of keys | - -### RabbitMQ Metrics - -| Metric | Description | -|--------|-------------| -| `rabbitmq.consumer.count` | Active consumers | -| `rabbitmq.message.current` | Messages in queue | -| `rabbitmq.message.acknowledged` | Messages ACKed | -| `rabbitmq.message.delivered` | Messages delivered | -| `rabbitmq.message.published` | Messages published | - -## πŸ” Traces - -**Automatic instrumentation for:** -- FastAPI endpoints -- HTTP client requests (HTTPX) -- Redis commands -- PostgreSQL queries (SQLAlchemy) -- RabbitMQ publish/consume - -**View traces:** -1. Go to **Services** tab in SigNoz -2. Select a service -3. View individual traces -4. Click trace β†’ See full span tree with timing - -## πŸ“ Logs - -**Features:** -- Structured logging with context -- Automatic trace-log correlation -- Searchable by service, level, message, custom fields - -**View logs:** -1. Go to **Logs** tab in SigNoz -2. Filter by service: `service_name="auth-service"` -3. Search for specific messages -4. 
Click log β†’ See full context including trace_id - -## πŸŽ›οΈ Configuration Files - -### Services - -All services configured in: -``` -infrastructure/kubernetes/base/components/*/\*-service.yaml -``` - -Each service has these environment variables: -```yaml -env: - - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_SERVICE_NAME - value: "service-name" - - name: ENABLE_TRACING - value: "true" - - name: OTEL_LOGS_EXPORTER - value: "otlp" - - name: ENABLE_OTEL_METRICS - value: "true" - - name: ENABLE_SYSTEM_METRICS - value: "true" -``` - -### SigNoz - -Configuration file: -``` -infrastructure/helm/signoz-values-dev.yaml -``` - -Key settings: -- OTLP receivers on ports 4317 (gRPC) and 4318 (HTTP) -- No Prometheus scraping (pure OTLP push) -- ClickHouse backend for storage -- Reduced resources for development - -### Database Monitoring - -Deployment file: -``` -infrastructure/kubernetes/base/monitoring/database-otel-collector.yaml -``` - -Setup script: -``` -infrastructure/kubernetes/setup-database-monitoring.sh -``` - -## πŸ“š Documentation - -| Document | Description | -|----------|-------------| -| [MONITORING_QUICKSTART.md](./MONITORING_QUICKSTART.md) | 10-minute quick start guide | -| [MONITORING_SETUP.md](./MONITORING_SETUP.md) | Detailed setup and troubleshooting | -| [DATABASE_MONITORING.md](./DATABASE_MONITORING.md) | Database metrics and logs guide | -| This document | Complete overview | - -## πŸ”§ Shared Libraries - -### Monitoring Modules - -Located in `shared/monitoring/`: - -| File | Purpose | -|------|---------| -| `__init__.py` | Package exports | -| `logging.py` | Standard logging setup | -| `logs_exporter.py` | OpenTelemetry logs export | -| `metrics.py` | OpenTelemetry metrics (no Prometheus) | -| `metrics_exporter.py` | OTLP metrics export setup | -| `system_metrics.py` | System metrics collection (CPU, memory, etc.) 
| -| `tracing.py` | Distributed tracing setup | -| `health_checks.py` | Health check endpoints | - -### Usage in Services - -```python -from shared.service_base import StandardFastAPIService - -# Create service -service = AuthService() - -# Create app with auto-configured monitoring -app = service.create_app() - -# Monitoring is automatically enabled: -# - Tracing (if ENABLE_TRACING=true) -# - Metrics (if ENABLE_OTEL_METRICS=true) -# - System metrics (if ENABLE_SYSTEM_METRICS=true) -# - Logs (if OTEL_LOGS_EXPORTER=otlp) -``` - -## 🎨 Dashboard Examples - -### Service Health Dashboard - -Create a dashboard with: -1. **Request Rate** - `rate(http_requests_total[5m])` -2. **Error Rate** - `rate(http_requests_total{status_code=~"5.."}[5m])` -3. **Latency (P95)** - `histogram_quantile(0.95, http_request_duration_seconds)` -4. **Active Requests** - `active_requests` -5. **CPU Usage** - `process.cpu.utilization` -6. **Memory Usage** - `process.memory.utilization` - -### Database Dashboard - -1. **PostgreSQL Connections** - `postgresql.backends` -2. **Database Size** - `postgresql.database.size` -3. **Transaction Rate** - `rate(postgresql.commits[5m])` -4. **Redis Hit Rate** - `redis.keyspace.hits / (redis.keyspace.hits + redis.keyspace.misses)` -5. **RabbitMQ Queue Depth** - `rabbitmq.message.current` - -## ⚠️ Alerts - -### Recommended Alerts - -**Application:** -- High error rate (>5% of requests failing) -- High latency (P95 > 1s) -- Service down (no metrics for 5 minutes) - -**System:** -- High CPU (>80% for 5 minutes) -- High memory (>90%) -- Disk space low (<10%) - -**Database:** -- PostgreSQL connections near max (>80% of max_connections) -- Slow queries (>5s) -- Redis memory high (>80%) -- RabbitMQ queue buildup (>10k messages) - -## πŸ› Troubleshooting - -### No Data in SigNoz - -```bash -# 1. Check service logs -kubectl logs -n bakery-ia deployment/auth-service | grep -i otel - -# 2. 
Check SigNoz collector -kubectl logs -n signoz deployment/signoz-otel-collector - -# 3. Test connectivity -kubectl exec -n bakery-ia deployment/auth-service -- \ - curl -v http://signoz-otel-collector.signoz.svc.cluster.local:4318 -``` - -### Database Metrics Missing - -```bash -# Check database monitoring collector -kubectl logs -n bakery-ia deployment/database-otel-collector - -# Verify monitoring user exists -kubectl exec -n bakery-ia deployment/auth-db -- \ - psql -U postgres -c "\du otel_monitor" -``` - -### Traces Not Correlated with Logs - -Ensure `OTEL_LOGS_EXPORTER=otlp` is set in service environment variables. - -## 🎯 Best Practices - -1. **Always use structured logging** - Add context with key-value pairs -2. **Add custom spans** - For important business operations -3. **Set appropriate log levels** - INFO for production, DEBUG for dev -4. **Monitor your monitors** - Alert on collector failures -5. **Regular retention policy reviews** - Balance cost vs. data retention -6. **Create service dashboards** - One dashboard per service -7. **Set up critical alerts first** - Service down, high error rate -8. **Document custom metrics** - Explain business-specific metrics - -## πŸ“Š Performance Impact - -**Resource Usage (per service):** -- CPU: +5-10% (instrumentation overhead) -- Memory: +50-100MB (SDK and buffers) -- Network: Minimal (batched export every 60s) - -**Latency Impact:** -- Per request: <1ms (async instrumentation) -- No impact on user-facing latency - -**Storage (SigNoz):** -- Traces: ~1GB per million requests -- Metrics: ~100MB per service per day -- Logs: Varies by log volume - -## πŸ” Security Considerations - -1. **Use dedicated monitoring users** - Never use app credentials -2. **Limit collector permissions** - Read-only access to databases -3. **Secure OTLP endpoints** - Use TLS in production -4. **Sanitize sensitive data** - Don't log passwords, tokens -5. **Network policies** - Restrict collector network access -6. 
**RBAC** - Limit SigNoz UI access per team - -## πŸš€ Next Steps - -1. **Deploy to production** - Update production SigNoz config -2. **Create team dashboards** - Per-service and system-wide views -3. **Set up alerts** - Start with critical service health alerts -4. **Train team** - SigNoz UI usage, query language -5. **Document runbooks** - How to respond to alerts -6. **Optimize retention** - Based on actual data volume -7. **Add custom metrics** - Business-specific KPIs - -## πŸ“ž Support - -- **SigNoz Community**: https://signoz.io/slack -- **OpenTelemetry Docs**: https://opentelemetry.io/docs/ -- **Internal Docs**: See /docs folder - -## πŸ“ Change Log - -| Date | Change | -|------|--------| -| 2026-01-08 | Initial implementation - All services configured | -| 2026-01-08 | Database monitoring added (PostgreSQL, Redis, RabbitMQ) | -| 2026-01-08 | System metrics collection implemented | -| 2026-01-08 | Removed Prometheus, pure OpenTelemetry | - ---- - -**Congratulations! Your platform now has complete observability. πŸŽ‰** - -Every request is traced, every metric is collected, every log is searchable. diff --git a/docs/MONITORING_DOCUMENTATION.md b/docs/MONITORING_DOCUMENTATION.md new file mode 100644 index 00000000..8dea4b38 --- /dev/null +++ b/docs/MONITORING_DOCUMENTATION.md @@ -0,0 +1,536 @@ +# πŸ“Š Bakery-ia Monitoring System Documentation + +## 🎯 Overview + +The bakery-ia platform features a comprehensive, modern monitoring system built on **OpenTelemetry** and **SigNoz**. This documentation provides a complete guide to the monitoring architecture, setup, and usage. 
+
+## πŸš€ Monitoring Architecture
+
+### Core Components
+
+```mermaid
+graph TD
+    A[Microservices] -->|OTLP| B[OpenTelemetry Collector]
+    B -->|gRPC| C[SigNoz]
+    C --> D[Traces Dashboard]
+    C --> E[Metrics Dashboard]
+    C --> F[Logs Dashboard]
+    C --> G[Alerts]
+```
+
+### Technology Stack
+
+- **Instrumentation**: OpenTelemetry Python SDK
+- **Protocol**: OTLP (OpenTelemetry Protocol) over HTTP (port 4318) and gRPC (port 4317)
+- **Backend**: SigNoz (open-source observability platform)
+- **Metrics**: Prometheus-compatible metrics via OTLP
+- **Traces**: Jaeger-compatible tracing via OTLP
+- **Logs**: Structured logging with trace correlation
+
+## πŸ“‹ Monitoring Coverage
+
+### Service Coverage (100%)
+
+| Service Category | Services | Monitoring Type | Status |
+|-----------------|----------|----------------|--------|
+| **Critical Services** | auth, orders, sales, external | Base Class | βœ… Monitored |
+| **AI Services** | ai-insights, training | Direct | βœ… Monitored |
+| **Data Services** | inventory, procurement, production, forecasting | Base Class | βœ… Monitored |
+| **Operational Services** | tenant, notification, distribution | Base Class | βœ… Monitored |
+| **Specialized Services** | suppliers, pos, recipes, orchestrator | Base Class | βœ… Monitored |
+| **Infrastructure** | gateway, alert-processor, demo-session | Direct | βœ… Monitored |
+
+**Total: 20 services with 100% monitoring coverage**
+
+## πŸ”§ Monitoring Implementation
+
+### Implementation Patterns
+
+#### 1. 
Base Class Pattern (16 services) + +Services using `StandardFastAPIService` inherit comprehensive monitoring: + +```python +from shared.service_base import StandardFastAPIService + +class MyService(StandardFastAPIService): + def __init__(self): + super().__init__( + service_name="my-service", + app_name="My Service", + description="Service description", + version="1.0.0", + # Monitoring enabled by default + enable_metrics=True, # βœ… Metrics collection + enable_tracing=True, # βœ… Distributed tracing + enable_health_checks=True # βœ… Health endpoints + ) +``` + +#### 2. Direct Pattern (4 services) + +Critical services with custom monitoring needs: + +```python +# services/ai_insights/app/main.py +from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware +from shared.monitoring.system_metrics import SystemMetricsCollector + +# Initialize metrics collectors +metrics_collector = MetricsCollector("ai-insights") +system_metrics = SystemMetricsCollector("ai-insights") + +# Add middleware +add_metrics_middleware(app, metrics_collector) +``` + +### Monitoring Components + +#### OpenTelemetry Instrumentation + +```python +# Automatic instrumentation in base class +FastAPIInstrumentor.instrument_app(app) # HTTP requests +HTTPXClientInstrumentor().instrument() # Outgoing HTTP +RedisInstrumentor().instrument() # Redis operations +SQLAlchemyInstrumentor().instrument() # Database queries +``` + +#### Metrics Collection + +```python +# Standard metrics automatically collected +metrics_collector.register_counter("http_requests_total", "Total HTTP requests") +metrics_collector.register_histogram("http_request_duration", "Request duration") +metrics_collector.register_gauge("active_requests", "Active requests") + +# System metrics automatically collected +system_metrics = SystemMetricsCollector("service-name") +# β†’ CPU, Memory, Disk I/O, Network I/O, Threads, File Descriptors +``` + +#### Health Checks + +```python +# Automatic health check endpoints +GET 
/health # Overall service health +GET /health/detailed # Detailed health with dependencies +GET /health/ready # Readiness probe +GET /health/live # Liveness probe +``` + +## πŸ“Š Metrics Reference + +### Standard Metrics (All Services) + +| Metric Type | Metric Name | Description | Labels | +|-------------|------------|-------------|--------| +| **HTTP Metrics** | `{service}_http_requests_total` | Total HTTP requests | method, endpoint, status_code | +| **HTTP Metrics** | `{service}_http_request_duration_seconds` | Request duration histogram | method, endpoint, status_code | +| **HTTP Metrics** | `{service}_active_requests` | Currently active requests | - | +| **System Metrics** | `process.cpu.utilization` | Process CPU usage | - | +| **System Metrics** | `process.memory.usage` | Process memory usage | - | +| **System Metrics** | `system.cpu.utilization` | System CPU usage | - | +| **System Metrics** | `system.memory.usage` | System memory usage | - | +| **Database Metrics** | `db.query.duration` | Database query duration | operation, table | +| **Cache Metrics** | `cache.operation.duration` | Cache operation duration | operation, key | + +### Custom Metrics (Service-Specific) + +Examples of service-specific metrics: + +**Auth Service:** +- `auth_registration_total` (by status) +- `auth_login_success_total` +- `auth_login_failure_total` (by reason) +- `auth_registration_duration_seconds` + +**Orders Service:** +- `orders_created_total` +- `orders_processed_total` (by status) +- `orders_processing_duration_seconds` + +**AI Insights Service:** +- `ai_insights_generated_total` +- `ai_model_inference_duration_seconds` +- `ai_feedback_received_total` + +## πŸ” Tracing Guide + +### Trace Propagation + +Traces automatically flow across service boundaries: + +```mermaid +sequenceDiagram + participant Client + participant Gateway + participant Auth + participant Orders + + Client->>Gateway: HTTP Request (trace_id: abc123) + Gateway->>Auth: Auth Check (trace_id: abc123) + 
Auth-->>Gateway: Auth Response (trace_id: abc123) + Gateway->>Orders: Create Order (trace_id: abc123) + Orders-->>Gateway: Order Created (trace_id: abc123) + Gateway-->>Client: Final Response (trace_id: abc123) +``` + +### Trace Context in Logs + +All logs include trace correlation: + +```json +{ + "level": "info", + "message": "Processing order", + "service": "orders-service", + "trace_id": "abc123def456", + "span_id": "789ghi", + "order_id": "12345", + "timestamp": "2024-01-08T19:00:00Z" +} +``` + +### Manual Trace Enhancement + +Add custom trace attributes: + +```python +from shared.monitoring.tracing import add_trace_attributes, add_trace_event + +# Add custom attributes +add_trace_attributes( + user_id="123", + tenant_id="abc", + operation="order_creation" +) + +# Add trace events +add_trace_event("order_validation_started") +# ... validation logic ... +add_trace_event("order_validation_completed", status="success") +``` + +## 🚨 Alerting Guide + +### Standard Alerts (Recommended) + +| Alert Name | Condition | Severity | Notification | +|------------|-----------|----------|--------------| +| **High Error Rate** | `error_rate > 5%` for 5m | High | PagerDuty + Slack | +| **High Latency** | `p99_latency > 2s` for 5m | High | PagerDuty + Slack | +| **Service Unavailable** | `up == 0` for 1m | Critical | PagerDuty + Slack + Email | +| **High Memory Usage** | `memory_usage > 80%` for 10m | Medium | Slack | +| **High CPU Usage** | `cpu_usage > 90%` for 5m | Medium | Slack | +| **Database Connection Issues** | `db_connections < minimum_pool_size` | High | PagerDuty + Slack | +| **Cache Hit Ratio Low** | `cache_hit_ratio < 70%` for 15m | Low | Slack | + +### Creating Alerts in SigNoz + +1. **Navigate to Alerts**: SigNoz UI β†’ Alerts β†’ Create Alert +2. **Select Metric**: Choose from available metrics +3. **Set Condition**: Define threshold and duration +4. **Configure Notifications**: Add notification channels +5. **Set Severity**: Critical, High, Medium, Low +6. 
**Add Description**: Explain alert purpose and resolution steps
+
+### Example Alert Configuration (YAML)
+
+```yaml
+# Example PrometheusRule for Kubernetes (requires prometheus-operator)
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: bakery-ia-alerts
+  namespace: monitoring
+spec:
+  groups:
+  - name: service-health
+    rules:
+    - alert: ServiceDown
+      expr: up{service!~"signoz.*"} == 0
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Service {{ $labels.service }} is down"
+        description: "{{ $labels.service }} has been down for more than 1 minute"
+        runbook: "https://github.com/yourorg/bakery-ia/blob/main/RUNBOOKS.md#service-down"
+
+    - alert: HighErrorRate
+      expr: rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
+      for: 5m
+      labels:
+        severity: high
+      annotations:
+        summary: "High error rate in {{ $labels.service }}"
+        description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"
+        runbook: "https://github.com/yourorg/bakery-ia/blob/main/RUNBOOKS.md#high-error-rate"
+```
+
+## πŸ“ˆ Dashboard Guide
+
+### Recommended Dashboards
+
+#### 1. Service Overview Dashboard
+- HTTP Request Rate
+- Error Rate
+- Latency Percentiles (p50, p90, p99)
+- Active Requests
+- System Resource Usage
+
+#### 2. Performance Dashboard
+- Request Duration Histogram
+- Database Query Performance
+- Cache Performance
+- External API Call Performance
+
+#### 3. System Health Dashboard
+- CPU Usage (Process & System)
+- Memory Usage (Process & System)
+- Disk I/O
+- Network I/O
+- File Descriptors
+- Thread Count
+
+#### 4. Business Metrics Dashboard
+- User Registrations
+- Order Volume
+- AI Insights Generated
+- API Usage by Tenant
+
+### Creating Dashboards in SigNoz
+
+1. **Navigate to Dashboards**: SigNoz UI β†’ Dashboards β†’ Create Dashboard
+2. **Add Panels**: Click "Add Panel" and select metric
+3. **Configure Visualization**: Choose chart type and settings
+4. **Set Time Range**: Default to last 1h, 6h, 24h, 7d
+5. 
**Add Variables**: For dynamic filtering (service, environment) +6. **Save Dashboard**: Give it a descriptive name + +## πŸ› οΈ Troubleshooting Guide + +### Common Issues & Solutions + +#### Issue: No Metrics Appearing in SigNoz + +**Checklist:** +- βœ… OpenTelemetry Collector running? `kubectl get pods -n signoz` +- βœ… Service can reach collector? `telnet signoz-otel-collector.signoz 4318` +- βœ… OTLP endpoint configured correctly? Check `OTEL_EXPORTER_OTLP_ENDPOINT` +- βœ… Service logs show OTLP export? Look for "Exporting metrics" +- βœ… No network policies blocking? Check Kubernetes network policies + +**Debugging:** +```bash +# Check OpenTelemetry Collector logs +kubectl logs -n signoz -l app=otel-collector + +# Check service logs for OTLP errors +kubectl logs -l app=auth-service | grep -i otel + +# Test OTLP connectivity from service pod +kubectl exec -it auth-service-pod -- curl -v http://signoz-otel-collector.signoz:4318 +``` + +#### Issue: High Latency in Specific Service + +**Checklist:** +- βœ… Database queries slow? Check `db.query.duration` metrics +- βœ… External API calls slow? Check trace waterfall +- βœ… High CPU usage? Check system metrics +- βœ… Memory pressure? Check memory metrics +- βœ… Too many active requests? Check concurrency + +**Debugging:** +```python +# Add detailed tracing to suspicious code +from shared.monitoring.tracing import add_trace_event + +add_trace_event("database_query_started", table="users") +# ... database query ... +add_trace_event("database_query_completed", duration_ms=45) +``` + +#### Issue: High Error Rate + +**Checklist:** +- βœ… Database connection issues? Check health endpoints +- βœ… External API failures? Check dependency metrics +- βœ… Authentication failures? Check auth service logs +- βœ… Validation errors? Check application logs +- βœ… Rate limiting? 
Check gateway metrics + +**Debugging:** +```bash +# Check error logs with trace correlation +kubectl logs -l app=auth-service | grep -i error | grep -i trace + +# Filter traces by error status +# In SigNoz: Add filter http.status_code >= 400 +``` + +## πŸ“š Runbook Reference + +See [RUNBOOKS.md](RUNBOOKS.md) for detailed troubleshooting procedures. + +## πŸ”§ Development Guide + +### Adding Custom Metrics + +```python +# In any service using direct monitoring +self.metrics_collector.register_counter( + "custom_metric_name", + "Description of what this metric tracks", + labels=["label1", "label2"] # Optional labels +) + +# Increment the counter +self.metrics_collector.increment_counter( + "custom_metric_name", + value=1, + labels={"label1": "value1", "label2": "value2"} +) +``` + +### Adding Custom Trace Attributes + +```python +# Add context to current span +from shared.monitoring.tracing import add_trace_attributes + +add_trace_attributes( + user_id=user.id, + tenant_id=tenant.id, + operation="premium_feature_access", + feature_name="advanced_forecasting" +) +``` + +### Service-Specific Monitoring Setup + +For services needing custom monitoring beyond the base class: + +```python +# In your service's __init__ method +from shared.monitoring.system_metrics import SystemMetricsCollector +from shared.monitoring.metrics import MetricsCollector + +class MyService(StandardFastAPIService): + def __init__(self): + # Call parent constructor first + super().__init__(...) 
+ + # Add custom metrics collector + self.custom_metrics = MetricsCollector("my-service") + + # Register custom metrics + self.custom_metrics.register_counter( + "business_specific_events", + "Custom business event counter" + ) + + # Add system metrics if not using base class defaults + self.system_metrics = SystemMetricsCollector("my-service") +``` + +## πŸ“Š SigNoz Configuration + +### Environment Variables + +```env +# OpenTelemetry Collector endpoint +OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz-otel-collector.signoz:4318 + +# Service-specific configuration +OTEL_SERVICE_NAME=auth-service +OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production,k8s.namespace=bakery-ia + +# Metrics export interval (default: 60000ms = 60s) +OTEL_METRIC_EXPORT_INTERVAL=60000 + +# Batch span processor configuration +OTEL_BSP_SCHEDULE_DELAY=5000 +OTEL_BSP_MAX_QUEUE_SIZE=2048 +OTEL_BSP_MAX_EXPORT_BATCH_SIZE=512 +``` + +### Kubernetes Configuration + +```yaml +# Example deployment with monitoring sidecar +apiVersion: apps/v1 +kind: Deployment +metadata: + name: auth-service +spec: + template: + spec: + containers: + - name: auth-service + image: auth-service:latest + env: + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://signoz-otel-collector.signoz:4318" + - name: OTEL_SERVICE_NAME + value: "auth-service" + - name: ENVIRONMENT + value: "production" + resources: + limits: + cpu: "1" + memory: "512Mi" + requests: + cpu: "200m" + memory: "256Mi" +``` + +## 🎯 Best Practices + +### Monitoring Best Practices + +1. **Use Consistent Naming**: Follow OpenTelemetry semantic conventions +2. **Add Context to Traces**: Include user/tenant IDs in trace attributes +3. **Monitor Dependencies**: Track external API and database performance +4. **Set Appropriate Alerts**: Avoid alert fatigue with meaningful thresholds +5. **Document Metrics**: Keep metrics documentation up to date +6. **Review Regularly**: Update dashboards as services evolve +7. 
**Test Alerts**: Ensure alerts fire correctly before production + +### Performance Best Practices + +1. **Batch Metrics Export**: Use default 60s interval for most services +2. **Sample Traces**: Consider sampling for high-volume services +3. **Limit Custom Metrics**: Only track metrics that provide value +4. **Use Histograms Wisely**: Histograms can be resource-intensive +5. **Monitor Monitoring**: Track OTLP export success/failure rates + +## πŸ“ž Support + +### Getting Help + +1. **Check Documentation**: This file and RUNBOOKS.md +2. **Review SigNoz Docs**: https://signoz.io/docs/ +3. **OpenTelemetry Docs**: https://opentelemetry.io/docs/ +4. **Team Channel**: #monitoring in Slack +5. **GitHub Issues**: https://github.com/yourorg/bakery-ia/issues + +### Escalation Path + +1. **First Line**: Development team (service owners) +2. **Second Line**: DevOps team (monitoring specialists) +3. **Third Line**: SigNoz support (vendor support) + +## πŸŽ‰ Summary + +The bakery-ia monitoring system provides: + +- **πŸ“Š 100% Service Coverage**: All 20 services monitored +- **πŸš€ Modern Architecture**: OpenTelemetry + SigNoz +- **πŸ”§ Comprehensive Metrics**: System, HTTP, database, cache +- **πŸ” Full Observability**: Traces, metrics, logs integrated +- **βœ… Production Ready**: Battle-tested and scalable + +**All services are fully instrumented and ready for production monitoring!** πŸŽ‰ \ No newline at end of file diff --git a/docs/MONITORING_QUICKSTART.md b/docs/MONITORING_QUICKSTART.md deleted file mode 100644 index 755f70d8..00000000 --- a/docs/MONITORING_QUICKSTART.md +++ /dev/null @@ -1,283 +0,0 @@ -# SigNoz Monitoring Quick Start - -Get complete observability (metrics, logs, traces, system metrics) in under 10 minutes using OpenTelemetry. 
- -## What You'll Get - -βœ… **Distributed Tracing** - Complete request flows across all services -βœ… **Application Metrics** - HTTP requests, durations, error rates, custom business metrics -βœ… **System Metrics** - CPU usage, memory usage, disk I/O, network I/O per service -βœ… **Structured Logs** - Searchable logs correlated with traces -βœ… **Unified Dashboard** - Single UI for all telemetry data - -**All data pushed via OpenTelemetry OTLP protocol - No Prometheus, no scraping needed!** - -## Prerequisites - -- Kubernetes cluster running (Kind/Minikube/Production) -- Helm 3.x installed -- kubectl configured - -## Step 1: Deploy SigNoz - -```bash -# Add Helm repository -helm repo add signoz https://charts.signoz.io -helm repo update - -# Create namespace -kubectl create namespace signoz - -# Install SigNoz -helm install signoz signoz/signoz \ - -n signoz \ - -f infrastructure/helm/signoz-values-dev.yaml - -# Wait for pods to be ready (2-3 minutes) -kubectl wait --for=condition=ready pod -l app=signoz -n signoz --timeout=300s -``` - -## Step 2: Configure Services - -Each service needs OpenTelemetry environment variables. The auth-service is already configured as an example. 
- -### Quick Configuration (for remaining services) - -Add these environment variables to each service deployment: - -```yaml -env: - # OpenTelemetry Collector endpoint - - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_SERVICE_NAME - value: "your-service-name" # e.g., "inventory-service" - - # Enable tracing - - name: ENABLE_TRACING - value: "true" - - # Enable logs export - - name: OTEL_LOGS_EXPORTER - value: "otlp" - - # Enable metrics export (includes system metrics) - - name: ENABLE_OTEL_METRICS - value: "true" - - name: ENABLE_SYSTEM_METRICS - value: "true" -``` - -### Using the Configuration Script - -```bash -# Generate configuration patches for all services -./infrastructure/kubernetes/add-monitoring-config.sh - -# This creates /tmp/*-otel-patch.yaml files -# Review and manually add to each service deployment -``` - -## Step 3: Deploy Updated Services - -```bash -# Apply updated configurations -kubectl apply -k infrastructure/kubernetes/overlays/dev/ - -# Or restart services to pick up new env vars -kubectl rollout restart deployment -n bakery-ia - -# Wait for rollout -kubectl rollout status deployment -n bakery-ia --timeout=5m -``` - -## Step 4: Access SigNoz UI - -### Via Ingress - -```bash -# Add to /etc/hosts if needed -echo "127.0.0.1 monitoring.bakery-ia.local" | sudo tee -a /etc/hosts - -# Access UI -open https://monitoring.bakery-ia.local -``` - -### Via Port Forward - -```bash -kubectl port-forward -n signoz svc/signoz-frontend 3301:3301 -open http://localhost:3301 -``` - -## Step 5: Explore Your Data - -### Traces - -1. Go to **Services** tab -2. See all your services listed -3. Click on a service β†’ View traces -4. 
Click on a trace β†’ See detailed span tree with timing - -### Metrics - -**HTTP Metrics** (automatically collected): -- `http_requests_total` - Total requests by method, endpoint, status -- `http_request_duration_seconds` - Request latency -- `active_requests` - Current active HTTP requests - -**System Metrics** (automatically collected per service): -- `process.cpu.utilization` - Process CPU usage % -- `process.memory.usage` - Process memory in bytes -- `process.memory.utilization` - Process memory % -- `process.threads.count` - Number of threads -- `system.cpu.utilization` - System-wide CPU % -- `system.memory.usage` - System memory usage -- `system.disk.io.read` - Disk bytes read -- `system.disk.io.write` - Disk bytes written -- `system.network.io.sent` - Network bytes sent -- `system.network.io.received` - Network bytes received - -**Custom Business Metrics** (if configured): -- User registrations -- Orders created -- Login attempts -- etc. - -### Logs - -1. Go to **Logs** tab -2. Filter by service: `service_name="auth-service"` -3. Search for specific messages -4. See structured fields (user_id, tenant_id, etc.) - -### Trace-Log Correlation - -1. Find a trace in **Traces** tab -2. Note the `trace_id` -3. Go to **Logs** tab -4. Filter: `trace_id=""` -5. See all logs for that specific request! - -## Verification Commands - -```bash -# Check if services are sending telemetry -kubectl logs -n bakery-ia deployment/auth-service | grep -i "telemetry\|otel" - -# Check SigNoz collector is receiving data -kubectl logs -n signoz deployment/signoz-otel-collector | tail -50 - -# Test connectivity to collector -kubectl exec -n bakery-ia deployment/auth-service -- \ - curl -v http://signoz-otel-collector.signoz.svc.cluster.local:4318 -``` - -## Common Issues - -### No data in SigNoz - -```bash -# 1. Verify environment variables are set -kubectl get deployment auth-service -n bakery-ia -o yaml | grep OTEL - -# 2. 
Check collector logs -kubectl logs -n signoz deployment/signoz-otel-collector - -# 3. Restart service -kubectl rollout restart deployment/auth-service -n bakery-ia -``` - -### Services not appearing - -```bash -# Check network connectivity -kubectl exec -n bakery-ia deployment/auth-service -- \ - curl http://signoz-otel-collector.signoz.svc.cluster.local:4318 - -# Should return: connection successful (not connection refused) -``` - -## Architecture - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Your Microservices β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ auth β”‚ β”‚ inv β”‚ β”‚ordersβ”‚ ... β”‚ -β”‚ β””β”€β”€β”¬β”€β”€β”€β”˜ β””β”€β”€β”¬β”€β”€β”€β”˜ β””β”€β”€β”¬β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ -β”‚ OTLP Push β”‚ -β”‚ (traces, metrics, logs) β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ SigNoz OpenTelemetry Collector β”‚ -β”‚ :4317 (gRPC) :4318 (HTTP) β”‚ -β”‚ β”‚ -β”‚ Receivers: OTLP only (no Prometheus) β”‚ -β”‚ Processors: batch, memory_limiter β”‚ -β”‚ Exporters: ClickHouse β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ ClickHouse Database β”‚ -β”‚ Stores: traces, metrics, logs β”‚ 
-β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ SigNoz Frontend UI β”‚ -β”‚ monitoring.bakery-ia.local or :3301 β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -## What Makes This Different - -**Pure OpenTelemetry** - No Prometheus involved: -- βœ… All metrics pushed via OTLP (not scraped) -- βœ… Automatic system metrics collection (CPU, memory, disk, network) -- βœ… Unified data model for all telemetry -- βœ… Native trace-metric-log correlation -- βœ… Lower resource usage (no scraping overhead) - -## Next Steps - -- **Create Dashboards** - Build custom views for your metrics -- **Set Up Alerts** - Configure alerts for errors, latency, resource usage -- **Explore System Metrics** - Monitor CPU, memory per service -- **Query Logs** - Use powerful log query language -- **Correlate Everything** - Jump from traces β†’ logs β†’ metrics - -## Need Help? 
- -- [Full Documentation](./MONITORING_SETUP.md) - Detailed setup guide -- [SigNoz Docs](https://signoz.io/docs/) - Official documentation -- [OpenTelemetry Python](https://opentelemetry.io/docs/instrumentation/python/) - Python instrumentation - ---- - -**Metrics You Get Out of the Box:** - -| Category | Metrics | Description | -|----------|---------|-------------| -| HTTP | `http_requests_total` | Total requests by method, endpoint, status | -| HTTP | `http_request_duration_seconds` | Request latency histogram | -| HTTP | `active_requests` | Current active requests | -| Process | `process.cpu.utilization` | Process CPU usage % | -| Process | `process.memory.usage` | Process memory in bytes | -| Process | `process.memory.utilization` | Process memory % | -| Process | `process.threads.count` | Thread count | -| System | `system.cpu.utilization` | System CPU % | -| System | `system.memory.usage` | System memory usage | -| System | `system.memory.utilization` | System memory % | -| Disk | `system.disk.io.read` | Disk read bytes | -| Disk | `system.disk.io.write` | Disk write bytes | -| Network | `system.network.io.sent` | Network sent bytes | -| Network | `system.network.io.received` | Network received bytes | diff --git a/docs/MONITORING_SETUP.md b/docs/MONITORING_SETUP.md deleted file mode 100644 index 2445b228..00000000 --- a/docs/MONITORING_SETUP.md +++ /dev/null @@ -1,511 +0,0 @@ -# SigNoz Monitoring Setup Guide - -This guide explains how to set up complete observability for the Bakery IA platform using SigNoz, which provides unified metrics, logs, and traces visualization. - -## Table of Contents - -1. [Architecture Overview](#architecture-overview) -2. [Prerequisites](#prerequisites) -3. [SigNoz Deployment](#signoz-deployment) -4. [Service Configuration](#service-configuration) -5. [Data Flow](#data-flow) -6. [Verification](#verification) -7. 
[Troubleshooting](#troubleshooting) - -## Architecture Overview - -The monitoring setup uses a three-tier approach: - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ Bakery IA Services β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Auth β”‚ β”‚ Inventoryβ”‚ β”‚ Orders β”‚ β”‚ ... β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ -β”‚ OpenTelemetry Protocol (OTLP) β”‚ -β”‚ Traces / Metrics / Logs β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ SigNoz OpenTelemetry Collector β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Receivers: β”‚ β”‚ -β”‚ β”‚ - OTLP gRPC (4317) - OTLP HTTP (4318) β”‚ β”‚ -β”‚ β”‚ - Prometheus Scraper (service discovery) β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ -β”‚ 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Processors: batch, memory_limiter, resourcedetection β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β”‚ β”‚ β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Exporters: ClickHouse (traces, metrics, logs) β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ ClickHouse Database β”‚ -β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ -β”‚ β”‚ Traces β”‚ β”‚ Metrics β”‚ β”‚ Logs β”‚ β”‚ -β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β–Ό -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” 
-β”‚ SigNoz Query Service β”‚ -β”‚ & Frontend UI β”‚ -β”‚ https://monitoring.bakery-ia.local β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -### Key Components - -1. **Services**: Generate telemetry data using OpenTelemetry SDK -2. **OpenTelemetry Collector**: Receives, processes, and exports telemetry -3. **ClickHouse**: Stores traces, metrics, and logs -4. **SigNoz UI**: Query and visualize all telemetry data - -## Prerequisites - -- Kubernetes cluster (Kind, Minikube, or production cluster) -- Helm 3.x installed -- kubectl configured -- At least 4GB RAM available for SigNoz components - -## SigNoz Deployment - -### 1. Add SigNoz Helm Repository - -```bash -helm repo add signoz https://charts.signoz.io -helm repo update -``` - -### 2. Create Namespace - -```bash -kubectl create namespace signoz -``` - -### 3. Deploy SigNoz - -```bash -# For development environment -helm install signoz signoz/signoz \ - -n signoz \ - -f infrastructure/helm/signoz-values-dev.yaml - -# For production environment -helm install signoz signoz/signoz \ - -n signoz \ - -f infrastructure/helm/signoz-values-prod.yaml -``` - -### 4. Verify Deployment - -```bash -# Check all pods are running -kubectl get pods -n signoz - -# Expected output: -# signoz-alertmanager-0 -# signoz-clickhouse-0 -# signoz-frontend-* -# signoz-otel-collector-* -# signoz-query-service-* - -# Check services -kubectl get svc -n signoz -``` - -## Service Configuration - -Each microservice needs to be configured to send telemetry to SigNoz. 
- -### Environment Variables - -Add these environment variables to your service deployments: - -```yaml -env: - # OpenTelemetry Collector endpoint - - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - # Service identification - - name: OTEL_SERVICE_NAME - value: "your-service-name" # e.g., "auth-service" - - # Enable tracing - - name: ENABLE_TRACING - value: "true" - - # Enable logs export - - name: OTEL_LOGS_EXPORTER - value: "otlp" - - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED - value: "true" - - # Enable metrics export (optional, default: true) - - name: ENABLE_OTEL_METRICS - value: "true" -``` - -### Prometheus Annotations - -Add these annotations to enable Prometheus metrics scraping: - -```yaml -metadata: - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "8000" - prometheus.io/path: "/metrics" -``` - -### Complete Example - -See [infrastructure/kubernetes/base/components/auth/auth-service.yaml](../infrastructure/kubernetes/base/components/auth/auth-service.yaml) for a complete example. - -### Automated Configuration Script - -Use the provided script to add monitoring configuration to all services: - -```bash -# Run from project root -./infrastructure/kubernetes/add-monitoring-config.sh -``` - -## Data Flow - -### 1. 
Traces - -**Automatic Instrumentation:** - -```python -# In your service's main.py -from shared.service_base import StandardFastAPIService - -service = AuthService() # Extends StandardFastAPIService -app = service.create_app() - -# Tracing is automatically enabled if ENABLE_TRACING=true -# All FastAPI endpoints, HTTP clients, Redis, PostgreSQL are auto-instrumented -``` - -**Manual Instrumentation:** - -```python -from shared.monitoring.tracing import add_trace_attributes, add_trace_event - -# Add custom attributes to current span -add_trace_attributes( - user_id="123", - tenant_id="abc", - operation="user_registration" -) - -# Add events for important operations -add_trace_event("user_authenticated", user_id="123", method="jwt") -``` - -### 2. Metrics - -**Dual Export Strategy:** - -Services export metrics in two ways: -1. **Prometheus format** at `/metrics` endpoint (scraped by SigNoz) -2. **OTLP push** directly to SigNoz collector (real-time) - -**Built-in Metrics:** - -```python -# Automatically collected by BaseFastAPIService: -# - http_requests_total -# - http_request_duration_seconds -# - active_connections -``` - -**Custom Metrics:** - -```python -# Define in your service -custom_metrics = { - "user_registrations": { - "type": "counter", - "description": "Total user registrations", - "labels": ["status"] - }, - "login_duration_seconds": { - "type": "histogram", - "description": "Login request duration" - } -} - -service = AuthService(custom_metrics=custom_metrics) - -# Use in your code -service.metrics_collector.increment_counter( - "user_registrations", - labels={"status": "success"} -) -``` - -### 3. 
Logs - -**Automatic Export:** - -```python -# Logs are automatically exported if OTEL_LOGS_EXPORTER=otlp -import logging -logger = logging.getLogger(__name__) - -# This will appear in SigNoz -logger.info("User logged in", extra={"user_id": "123", "tenant_id": "abc"}) -``` - -**Structured Logging with Context:** - -```python -from shared.monitoring.logs_exporter import add_log_context - -# Add context that persists across log calls -log_ctx = add_log_context( - request_id="req_123", - user_id="user_456", - tenant_id="tenant_789" -) - -# All subsequent logs include this context -log_ctx.info("Processing order") # Includes request_id, user_id, tenant_id -``` - -**Trace Correlation:** - -```python -from shared.monitoring.logs_exporter import get_current_trace_context - -# Get trace context for correlation -trace_ctx = get_current_trace_context() -logger.info("Processing request", extra=trace_ctx) -# Logs now include trace_id and span_id for correlation -``` - -## Verification - -### 1. Check Service Health - -```bash -# Check that services are exporting telemetry -kubectl logs -n bakery-ia deployment/auth-service | grep -i "telemetry\|otel\|signoz" - -# Expected output includes: -# - "Distributed tracing configured" -# - "OpenTelemetry logs export configured" -# - "OpenTelemetry metrics export configured" -``` - -### 2. Access SigNoz UI - -```bash -# Port-forward (for local development) -kubectl port-forward -n signoz svc/signoz-frontend 3301:3301 - -# Or via Ingress -open https://monitoring.bakery-ia.local -``` - -### 3. Verify Data Ingestion - -**Traces:** -1. Go to SigNoz UI β†’ Traces -2. You should see traces from your services -3. Click on a trace to see the full span tree - -**Metrics:** -1. Go to SigNoz UI β†’ Metrics -2. Query: `http_requests_total` -3. Filter by service: `service="auth-service"` - -**Logs:** -1. Go to SigNoz UI β†’ Logs -2. Filter by service: `service_name="auth-service"` -3. Search for specific log messages - -### 4. 
Test Trace-Log Correlation - -1. Find a trace in SigNoz UI -2. Copy the `trace_id` -3. Go to Logs tab -4. Search: `trace_id=""` -5. You should see all logs for that trace - -## Troubleshooting - -### No Data in SigNoz - -**1. Check OpenTelemetry Collector:** - -```bash -# Check collector logs -kubectl logs -n signoz deployment/signoz-otel-collector - -# Should see: -# - "Receiver is starting" -# - "Exporter is starting" -# - No error messages -``` - -**2. Check Service Configuration:** - -```bash -# Verify environment variables -kubectl get deployment auth-service -n bakery-ia -o yaml | grep -A 20 "env:" - -# Verify annotations -kubectl get deployment auth-service -n bakery-ia -o yaml | grep -A 5 "annotations:" -``` - -**3. Check Network Connectivity:** - -```bash -# Test from service pod -kubectl exec -n bakery-ia deployment/auth-service -- \ - curl -v http://signoz-otel-collector.signoz.svc.cluster.local:4318/v1/traces - -# Should return: 405 Method Not Allowed (POST required) -# If connection refused, check network policies -``` - -### Traces Not Appearing - -**Check instrumentation:** - -```python -# Verify tracing is enabled -import os -print(os.getenv("ENABLE_TRACING")) # Should be "true" -print(os.getenv("OTEL_COLLECTOR_ENDPOINT")) # Should be set -``` - -**Check trace sampling:** - -```bash -# Verify sampling rate (default 100%) -kubectl logs -n bakery-ia deployment/auth-service | grep "sampling" -``` - -### Metrics Not Appearing - -**1. Verify Prometheus annotations:** - -```bash -kubectl get pods -n bakery-ia -o yaml | grep "prometheus.io" -``` - -**2. Test metrics endpoint:** - -```bash -# Port-forward service -kubectl port-forward -n bakery-ia deployment/auth-service 8000:8000 - -# Test endpoint -curl http://localhost:8000/metrics - -# Should return Prometheus format metrics -``` - -**3. 
Check SigNoz scrape configuration:** - -```bash -# Check collector config -kubectl get configmap -n signoz signoz-otel-collector -o yaml | grep -A 30 "prometheus:" -``` - -### Logs Not Appearing - -**1. Verify log export is enabled:** - -```bash -kubectl get deployment auth-service -n bakery-ia -o yaml | grep OTEL_LOGS_EXPORTER -# Should return: OTEL_LOGS_EXPORTER=otlp -``` - -**2. Check log format:** - -```bash -# Logs should be JSON formatted -kubectl logs -n bakery-ia deployment/auth-service | head -5 -``` - -**3. Verify OTLP endpoint:** - -```bash -# Test logs endpoint -kubectl exec -n bakery-ia deployment/auth-service -- \ - curl -X POST http://signoz-otel-collector.signoz.svc.cluster.local:4318/v1/logs \ - -H "Content-Type: application/json" \ - -d '{"resourceLogs":[]}' - -# Should return 200 OK or 400 Bad Request (not connection error) -``` - -## Performance Tuning - -### For Development - -The default configuration is optimized for local development with minimal resources. - -### For Production - -Update the following in `signoz-values-prod.yaml`: - -```yaml -# Increase collector resources -otelCollector: - resources: - requests: - cpu: 500m - memory: 1Gi - limits: - cpu: 2000m - memory: 2Gi - -# Increase batch sizes -config: - processors: - batch: - timeout: 10s - send_batch_size: 10000 # Increased from 1024 - -# Add more replicas -replicaCount: 2 -``` - -## Best Practices - -1. **Use Structured Logging**: Always use key-value pairs for better querying -2. **Add Context**: Include user_id, tenant_id, request_id in logs -3. **Trace Business Operations**: Add custom spans for important operations -4. **Monitor Collector Health**: Set up alerts for collector errors -5. 
**Retention Policy**: Configure ClickHouse retention based on needs - -## Additional Resources - -- [SigNoz Documentation](https://signoz.io/docs/) -- [OpenTelemetry Python](https://opentelemetry.io/docs/instrumentation/python/) -- [Bakery IA Monitoring Shared Library](../shared/monitoring/) - -## Support - -For issues or questions: -1. Check SigNoz community: https://signoz.io/slack -2. Review OpenTelemetry docs: https://opentelemetry.io/docs/ -3. Create issue in project repository diff --git a/gateway/app/main.py b/gateway/app/main.py index 8c6fb1f3..70e908df 100644 --- a/gateway/app/main.py +++ b/gateway/app/main.py @@ -28,6 +28,7 @@ from app.middleware.read_only_mode import ReadOnlyModeMiddleware from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context from shared.monitoring.logging import setup_logging from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware +from shared.monitoring.system_metrics import SystemMetricsCollector # OpenTelemetry imports from opentelemetry import trace @@ -200,7 +201,12 @@ async def startup_event(): logger.info("Metrics registered successfully") - metrics_collector.start_metrics_server(8080) + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("gateway") + logger.info("System metrics collection started") + + logger.info("Metrics export configured via OpenTelemetry OTLP") logger.info("API Gateway started successfully") @@ -227,13 +233,8 @@ async def health_check(): "timestamp": time.time() } -@app.get("/metrics") -async def metrics(): - """Prometheus metrics endpoint""" - return Response( - content=metrics_collector.get_metrics(), - media_type="text/plain; version=0.0.4; charset=utf-8" - ) +# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz +# The /metrics endpoint is not needed as metrics are pushed automatically # 
================================================================ # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS diff --git a/gateway/requirements.txt b/gateway/requirements.txt index ba506f58..3ccfcdfe 100644 --- a/gateway/requirements.txt +++ b/gateway/requirements.txt @@ -19,6 +19,9 @@ sqlalchemy==2.0.44 asyncpg==0.30.0 cryptography==44.0.0 ortools==9.8.3296 + + +psutil==5.9.8 opentelemetry-api==1.39.1 opentelemetry-sdk==1.39.1 opentelemetry-instrumentation-fastapi==0.60b1 diff --git a/infrastructure/kubernetes/setup-database-monitoring.sh b/infrastructure/kubernetes/setup-database-monitoring.sh deleted file mode 100755 index 490dd8d1..00000000 --- a/infrastructure/kubernetes/setup-database-monitoring.sh +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/bash -# Setup script for database monitoring with OpenTelemetry and SigNoz -# This script creates monitoring users in PostgreSQL and deploys the collector - -set -e - -echo "=========================================" -echo "Database Monitoring Setup for SigNoz" -echo "=========================================" -echo "" - -# Configuration -NAMESPACE="bakery-ia" -MONITOR_USER="otel_monitor" -MONITOR_PASSWORD=$(openssl rand -base64 32) - -# PostgreSQL databases to monitor -DATABASES=( - "auth-db-service:auth_db" - "inventory-db-service:inventory_db" - "orders-db-service:orders_db" - "tenant-db-service:tenant_db" - "sales-db-service:sales_db" - "production-db-service:production_db" - "recipes-db-service:recipes_db" - "procurement-db-service:procurement_db" - "distribution-db-service:distribution_db" - "forecasting-db-service:forecasting_db" - "external-db-service:external_db" - "suppliers-db-service:suppliers_db" - "pos-db-service:pos_db" - "training-db-service:training_db" - "notification-db-service:notification_db" - "orchestrator-db-service:orchestrator_db" - "ai-insights-db-service:ai_insights_db" -) - -echo "Step 1: Creating monitoring user in PostgreSQL databases" -echo "=========================================" -echo "" - 
-for db_entry in "${DATABASES[@]}"; do - IFS=':' read -r service dbname <<< "$db_entry" - - echo "Creating monitoring user in $dbname..." - - # Create monitoring user via kubectl exec - kubectl exec -n "$NAMESPACE" "deployment/${service%-service}" -- psql -U postgres -d "$dbname" -c " - DO \$\$ - BEGIN - IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '$MONITOR_USER') THEN - CREATE USER $MONITOR_USER WITH PASSWORD '$MONITOR_PASSWORD'; - GRANT pg_monitor TO $MONITOR_USER; - GRANT CONNECT ON DATABASE $dbname TO $MONITOR_USER; - RAISE NOTICE 'User $MONITOR_USER created successfully'; - ELSE - RAISE NOTICE 'User $MONITOR_USER already exists'; - END IF; - END - \$\$; - " 2>/dev/null || echo " ⚠️ Warning: Could not create user in $dbname (may already exist or database not ready)" - - echo "" -done - -echo "βœ… Monitoring users created" -echo "" - -echo "Step 2: Creating Kubernetes secret for monitoring credentials" -echo "=========================================" -echo "" - -# Create secret for database monitoring -kubectl create secret generic database-monitor-secrets \ - -n "$NAMESPACE" \ - --from-literal=POSTGRES_MONITOR_USER="$MONITOR_USER" \ - --from-literal=POSTGRES_MONITOR_PASSWORD="$MONITOR_PASSWORD" \ - --dry-run=client -o yaml | kubectl apply -f - - -echo "βœ… Secret created: database-monitor-secrets" -echo "" - -echo "Step 3: Deploying OpenTelemetry collector for database monitoring" -echo "=========================================" -echo "" - -kubectl apply -f infrastructure/kubernetes/base/monitoring/database-otel-collector.yaml - -echo "βœ… Database monitoring collector deployed" -echo "" - -echo "Step 4: Waiting for collector to be ready" -echo "=========================================" -echo "" - -kubectl wait --for=condition=available --timeout=60s \ - deployment/database-otel-collector -n "$NAMESPACE" - -echo "βœ… Collector is ready" -echo "" - -echo "=========================================" -echo "Database Monitoring Setup Complete!" 
-echo "=========================================" -echo "" -echo "What's been configured:" -echo " βœ… Monitoring user created in all PostgreSQL databases" -echo " βœ… OpenTelemetry collector deployed for database metrics" -echo " βœ… Metrics exported to SigNoz" -echo "" -echo "Metrics being collected:" -echo " πŸ“Š PostgreSQL: connections, commits, rollbacks, deadlocks, table sizes" -echo " πŸ“Š Redis: memory usage, keyspace hits/misses, connected clients" -echo " πŸ“Š RabbitMQ: queue depth, message rates, consumer count" -echo "" -echo "Next steps:" -echo " 1. Check collector logs:" -echo " kubectl logs -n $NAMESPACE deployment/database-otel-collector" -echo "" -echo " 2. View metrics in SigNoz:" -echo " - Go to https://monitoring.bakery-ia.local" -echo " - Create dashboard with queries like:" -echo " * postgresql.backends (connections)" -echo " * postgresql.database.size (database size)" -echo " * redis.memory.used (Redis memory)" -echo " * rabbitmq.message.current (queue depth)" -echo "" -echo " 3. 
Create alerts for:" -echo " - High connection count (approaching max_connections)" -echo " - Slow query detection (via application traces)" -echo " - High Redis memory usage" -echo " - RabbitMQ queue buildup" -echo "" diff --git a/services/ai_insights/app/main.py b/services/ai_insights/app/main.py index 4c79b0de..32f7b06b 100644 --- a/services/ai_insights/app/main.py +++ b/services/ai_insights/app/main.py @@ -11,6 +11,7 @@ from app.core.database import init_db, close_db from app.api import insights from shared.monitoring.logging import setup_logging from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware +from shared.monitoring.system_metrics import SystemMetricsCollector # OpenTelemetry imports from opentelemetry import trace @@ -56,9 +57,12 @@ async def lifespan(app: FastAPI): await init_db() logger.info("Database initialized") - # Start metrics server - metrics_collector.start_metrics_server(8080) - logger.info("Metrics server started on port 8080") + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("ai-insights") + logger.info("System metrics collection started") + + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed + logger.info("Metrics export configured via OpenTelemetry OTLP") yield @@ -131,13 +135,8 @@ async def health_check(): } -@app.get("/metrics") -async def metrics(): - """Prometheus metrics endpoint""" - return Response( - content=metrics_collector.get_metrics(), - media_type="text/plain; version=0.0.4; charset=utf-8" - ) +# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz +# The /metrics endpoint is not needed as metrics are pushed automatically if __name__ == "__main__": diff --git a/services/alert_processor/app/main.py b/services/alert_processor/app/main.py index 98614f5d..a22b25b8 100644 --- a/services/alert_processor/app/main.py +++ b/services/alert_processor/app/main.py @@ -16,6 +16,7 @@ from app.api import alerts, sse from 
shared.redis_utils import initialize_redis, close_redis from shared.monitoring.logging import setup_logging from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware +from shared.monitoring.system_metrics import SystemMetricsCollector # OpenTelemetry imports from opentelemetry import trace @@ -82,9 +83,12 @@ async def lifespan(app: FastAPI): await consumer.start() logger.info("alert_processor_started") - # Start metrics server - metrics_collector.start_metrics_server(8080) - logger.info("Metrics server started on port 8080") + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("alert-processor") + logger.info("System metrics collection started") + + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed + logger.info("Metrics export configured via OpenTelemetry OTLP") except Exception as e: logger.error("alert_processor_startup_failed", error=str(e)) raise @@ -175,13 +179,8 @@ async def root(): } -@app.get("/metrics") -async def metrics(): - """Prometheus metrics endpoint""" - return Response( - content=metrics_collector.get_metrics(), - media_type="text/plain; version=0.0.4; charset=utf-8" - ) +# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz +# The /metrics endpoint is not needed as metrics are pushed automatically if __name__ == "__main__": diff --git a/services/demo_session/app/main.py b/services/demo_session/app/main.py index 61a15b79..c3e61f5e 100644 --- a/services/demo_session/app/main.py +++ b/services/demo_session/app/main.py @@ -15,6 +15,7 @@ from app.api import demo_sessions, demo_accounts, demo_operations, internal from shared.redis_utils import initialize_redis, close_redis from shared.monitoring.logging import setup_logging from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware +from shared.monitoring.system_metrics import SystemMetricsCollector # OpenTelemetry imports from opentelemetry import trace @@ -69,9 +70,12 @@ async def 
lifespan(app: FastAPI): max_connections=50 ) - # Start metrics server - metrics_collector.start_metrics_server(8080) - logger.info("Metrics server started on port 8080") + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("demo-session") + logger.info("System metrics collection started") + + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed + logger.info("Metrics export configured via OpenTelemetry OTLP") logger.info("Demo Session Service started successfully") @@ -164,13 +168,8 @@ async def health(): } -@app.get("/metrics") -async def metrics(): - """Prometheus metrics endpoint""" - return Response( - content=metrics_collector.get_metrics(), - media_type="text/plain; version=0.0.4; charset=utf-8" - ) +# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz +# The /metrics endpoint is not needed as metrics are pushed automatically if __name__ == "__main__": diff --git a/services/demo_session/app/monitoring/metrics.py b/services/demo_session/app/monitoring/metrics.py deleted file mode 100644 index 27221388..00000000 --- a/services/demo_session/app/monitoring/metrics.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Prometheus metrics for demo session service -""" - -from prometheus_client import Counter, Histogram, Gauge - -# Counters -demo_sessions_created_total = Counter( - 'demo_sessions_created_total', - 'Total number of demo sessions created', - ['tier', 'status'] -) - -demo_sessions_deleted_total = Counter( - 'demo_sessions_deleted_total', - 'Total number of demo sessions deleted', - ['tier', 'status'] -) - -demo_cloning_errors_total = Counter( - 'demo_cloning_errors_total', - 'Total number of cloning errors', - ['tier', 'service', 'error_type'] -) - -# Histograms (for latency percentiles) -demo_session_creation_duration_seconds = Histogram( - 'demo_session_creation_duration_seconds', - 'Duration of demo session creation', - ['tier'], - buckets=[1, 2, 5, 7, 10, 12, 15, 18, 20, 25, 30, 40, 50, 
60] -) - -demo_service_clone_duration_seconds = Histogram( - 'demo_service_clone_duration_seconds', - 'Duration of individual service cloning', - ['tier', 'service'], - buckets=[0.5, 1, 2, 3, 5, 10, 15, 20, 30, 40, 50] -) - -demo_session_cleanup_duration_seconds = Histogram( - 'demo_session_cleanup_duration_seconds', - 'Duration of demo session cleanup', - ['tier'], - buckets=[0.5, 1, 2, 5, 10, 15, 20, 30] -) - -# Gauges -demo_sessions_active = Gauge( - 'demo_sessions_active', - 'Number of currently active demo sessions', - ['tier'] -) - -demo_sessions_pending_cleanup = Gauge( - 'demo_sessions_pending_cleanup', - 'Number of demo sessions pending cleanup' -) - -# Alert generation metrics -demo_alerts_generated_total = Counter( - 'demo_alerts_generated_total', - 'Total number of alerts generated post-clone', - ['tier', 'alert_type'] -) - -demo_ai_insights_generated_total = Counter( - 'demo_ai_insights_generated_total', - 'Total number of AI insights generated post-clone', - ['tier', 'insight_type'] -) - -# Cross-service metrics -demo_cross_service_calls_total = Counter( - 'demo_cross_service_calls_total', - 'Total number of cross-service API calls during cloning', - ['source_service', 'target_service', 'status'] -) - -demo_cross_service_call_duration_seconds = Histogram( - 'demo_cross_service_call_duration_seconds', - 'Duration of cross-service API calls during cloning', - ['source_service', 'target_service'], - buckets=[0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30] -) \ No newline at end of file diff --git a/services/demo_session/app/services/cleanup_service.py b/services/demo_session/app/services/cleanup_service.py index 76061a1e..04ac43a7 100644 --- a/services/demo_session/app/services/cleanup_service.py +++ b/services/demo_session/app/services/cleanup_service.py @@ -14,11 +14,6 @@ import os from app.models import DemoSession, DemoSessionStatus from datetime import datetime, timezone, timedelta from app.core.redis_wrapper import DemoRedisWrapper -from 
app.monitoring.metrics import ( - demo_sessions_deleted_total, - demo_session_cleanup_duration_seconds, - demo_sessions_active -) logger = structlog.get_logger() diff --git a/services/demo_session/app/services/clone_orchestrator.py b/services/demo_session/app/services/clone_orchestrator.py index a6957e8f..d72ccc2f 100644 --- a/services/demo_session/app/services/clone_orchestrator.py +++ b/services/demo_session/app/services/clone_orchestrator.py @@ -15,17 +15,6 @@ from shared.clients.inventory_client import InventoryServiceClient from shared.clients.production_client import ProductionServiceClient from shared.clients.procurement_client import ProcurementServiceClient from shared.config.base import BaseServiceSettings -from app.monitoring.metrics import ( - demo_sessions_created_total, - demo_session_creation_duration_seconds, - demo_service_clone_duration_seconds, - demo_cloning_errors_total, - demo_sessions_active, - demo_alerts_generated_total, - demo_ai_insights_generated_total, - demo_cross_service_calls_total, - demo_cross_service_call_duration_seconds -) logger = structlog.get_logger() diff --git a/services/notification/app/main.py b/services/notification/app/main.py index a3c1a0f6..ecbab3b2 100644 --- a/services/notification/app/main.py +++ b/services/notification/app/main.py @@ -22,6 +22,7 @@ from app.services.whatsapp_service import WhatsAppService from app.consumers.po_event_consumer import POEventConsumer from shared.service_base import StandardFastAPIService from shared.clients.tenant_client import TenantServiceClient +from shared.monitoring.system_metrics import SystemMetricsCollector import asyncio @@ -184,6 +185,10 @@ class NotificationService(StandardFastAPIService): self.email_service = EmailService() self.whatsapp_service = WhatsAppService(tenant_client=self.tenant_client) + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("notification") + self.logger.info("System metrics collection started") + # Initialize SSE 
service self.sse_service = SSEService() await self.sse_service.initialize(settings.REDIS_URL) @@ -271,12 +276,14 @@ class NotificationService(StandardFastAPIService): return {"error": "SSE service not available"} # Metrics endpoint - @self.app.get("/metrics") - async def metrics(): - """Prometheus metrics endpoint""" - if self.metrics_collector: - return self.metrics_collector.get_metrics() - return {"metrics": "not_available"} + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz + # The /metrics endpoint is not needed as metrics are pushed automatically + # @self.app.get("/metrics") + # async def metrics(): + # """Prometheus metrics endpoint""" + # if self.metrics_collector: + # return self.metrics_collector.get_metrics() + # return {"metrics": "not_available"} # Create service instance diff --git a/services/tenant/app/main.py b/services/tenant/app/main.py index dba91eb5..1bb85e6a 100644 --- a/services/tenant/app/main.py +++ b/services/tenant/app/main.py @@ -9,6 +9,7 @@ from app.core.config import settings from app.core.database import database_manager from app.api import tenants, tenant_members, tenant_operations, webhooks, plans, subscription, tenant_settings, whatsapp_admin, usage_forecast, enterprise_upgrade, tenant_locations, tenant_hierarchy, internal_demo, network_alerts, onboarding from shared.service_base import StandardFastAPIService +from shared.monitoring.system_metrics import SystemMetricsCollector class TenantService(StandardFastAPIService): @@ -77,6 +78,10 @@ class TenantService(StandardFastAPIService): redis_client = await get_redis_client() self.logger.info("Redis initialized successfully") + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("tenant") + self.logger.info("System metrics collection started") + # Start usage tracking scheduler from app.jobs.usage_tracking_scheduler import start_scheduler await start_scheduler(self.database_manager, redis_client, settings) @@ -108,12 +113,14 @@ class 
TenantService(StandardFastAPIService): def setup_custom_endpoints(self): """Setup custom endpoints for tenant service""" - @self.app.get("/metrics") - async def metrics(): - """Prometheus metrics endpoint""" - if self.metrics_collector: - return self.metrics_collector.get_metrics() - return {"metrics": "not_available"} + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz + # The /metrics endpoint is not needed as metrics are pushed automatically + # @self.app.get("/metrics") + # async def metrics(): + # """Prometheus metrics endpoint""" + # if self.metrics_collector: + # return self.metrics_collector.get_metrics() + # return {"metrics": "not_available"} # Create service instance diff --git a/services/training/app/main.py b/services/training/app/main.py index 60f6a415..89e76c3b 100644 --- a/services/training/app/main.py +++ b/services/training/app/main.py @@ -15,6 +15,7 @@ from app.api import training_jobs, training_operations, models, health, monitori from app.services.training_events import setup_messaging, cleanup_messaging from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers from shared.service_base import StandardFastAPIService +from shared.monitoring.system_metrics import SystemMetricsCollector class TrainingService(StandardFastAPIService): @@ -77,6 +78,11 @@ class TrainingService(StandardFastAPIService): async def on_startup(self, app: FastAPI): """Custom startup logic including migration verification""" await self.verify_migrations() + + # Initialize system metrics collection + system_metrics = SystemMetricsCollector("training") + self.logger.info("System metrics collection started") + self.logger.info("Training service startup completed") async def on_shutdown(self, app: FastAPI): @@ -132,12 +138,14 @@ class TrainingService(StandardFastAPIService): def setup_custom_endpoints(self): """Setup custom endpoints for training service""" - @self.app.get("/metrics") - async def get_metrics(): - """Prometheus 
metrics endpoint""" - if self.metrics_collector: - return self.metrics_collector.get_metrics() - return {"status": "metrics not available"} + # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz + # The /metrics endpoint is not needed as metrics are pushed automatically + # @self.app.get("/metrics") + # async def get_metrics(): + # """Prometheus metrics endpoint""" + # if self.metrics_collector: + # return self.metrics_collector.get_metrics() + # return {"status": "metrics not available"} @self.app.get("/") async def root(): diff --git a/shared/monitoring/alert_metrics.py b/shared/monitoring/alert_metrics.py deleted file mode 100755 index a8b385fb..00000000 --- a/shared/monitoring/alert_metrics.py +++ /dev/null @@ -1,420 +0,0 @@ -# shared/monitoring/alert_metrics.py -""" -Metrics and monitoring for the alert and recommendation system -Provides comprehensive metrics for tracking system performance and effectiveness -""" - -from prometheus_client import Counter, Histogram, Gauge, Summary, Info -from typing import Dict, Any -import time -from functools import wraps -import structlog - -logger = structlog.get_logger() - -# ================================================================= -# DETECTION METRICS -# ================================================================= - -# Alert and recommendation generation -items_published = Counter( - 'alert_items_published_total', - 'Total number of alerts and recommendations published', - ['service', 'item_type', 'severity', 'type'] -) - -item_checks_performed = Counter( - 'alert_checks_performed_total', - 'Total number of alert checks performed', - ['service', 'check_type', 'pattern'] -) - -item_check_duration = Histogram( - 'alert_check_duration_seconds', - 'Time taken to perform alert checks', - ['service', 'check_type'], - buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60] -) - -alert_detection_errors = Counter( - 'alert_detection_errors_total', - 'Total number of errors during alert detection', - ['service', 
'error_type', 'check_type'] -) - -# Deduplication metrics -duplicate_items_prevented = Counter( - 'duplicate_items_prevented_total', - 'Number of duplicate alerts/recommendations prevented', - ['service', 'item_type', 'type'] -) - -# ================================================================= -# PROCESSING METRICS -# ================================================================= - -# Alert processor metrics -items_processed = Counter( - 'alert_items_processed_total', - 'Total number of items processed by alert processor', - ['item_type', 'severity', 'type', 'status'] -) - -item_processing_duration = Histogram( - 'alert_processing_duration_seconds', - 'Time taken to process alerts/recommendations', - ['item_type', 'severity'], - buckets=[0.01, 0.05, 0.1, 0.5, 1, 2, 5] -) - -database_storage_duration = Histogram( - 'alert_database_storage_duration_seconds', - 'Time taken to store items in database', - buckets=[0.01, 0.05, 0.1, 0.5, 1] -) - -processing_errors = Counter( - 'alert_processing_errors_total', - 'Total number of processing errors', - ['error_type', 'item_type'] -) - -# ================================================================= -# DELIVERY METRICS -# ================================================================= - -# Notification delivery -notifications_sent = Counter( - 'alert_notifications_sent_total', - 'Total notifications sent through all channels', - ['channel', 'item_type', 'severity', 'status'] -) - -notification_delivery_duration = Histogram( - 'alert_notification_delivery_duration_seconds', - 'Time from item generation to delivery', - ['item_type', 'severity', 'channel'], - buckets=[0.1, 0.5, 1, 5, 10, 30, 60] -) - -delivery_failures = Counter( - 'alert_delivery_failures_total', - 'Failed notification deliveries', - ['channel', 'item_type', 'error_type'] -) - -# Channel-specific metrics -email_notifications = Counter( - 'alert_email_notifications_total', - 'Email notifications sent', - ['status', 'item_type'] -) - 
-whatsapp_notifications = Counter( - 'alert_whatsapp_notifications_total', - 'WhatsApp notifications sent', - ['status', 'item_type'] -) - -sse_events_sent = Counter( - 'alert_sse_events_sent_total', - 'SSE events sent to dashboard', - ['tenant', 'event_type', 'item_type'] -) - -# ================================================================= -# SSE METRICS -# ================================================================= - -# SSE connection metrics -sse_active_connections = Gauge( - 'alert_sse_active_connections', - 'Number of active SSE connections', - ['tenant_id'] -) - -sse_connection_duration = Histogram( - 'alert_sse_connection_duration_seconds', - 'Duration of SSE connections', - buckets=[10, 30, 60, 300, 600, 1800, 3600] -) - -sse_message_queue_size = Gauge( - 'alert_sse_message_queue_size', - 'Current size of SSE message queues', - ['tenant_id'] -) - -sse_connection_errors = Counter( - 'alert_sse_connection_errors_total', - 'SSE connection errors', - ['error_type', 'tenant_id'] -) - -# ================================================================= -# SYSTEM HEALTH METRICS -# ================================================================= - -# Active items gauge -active_items_gauge = Gauge( - 'alert_active_items_current', - 'Current number of active alerts and recommendations', - ['tenant_id', 'item_type', 'severity'] -) - -# System component health -system_component_health = Gauge( - 'alert_system_component_health', - 'Health status of alert system components (1=healthy, 0=unhealthy)', - ['component', 'service'] -) - -# Leader election status -scheduler_leader_status = Gauge( - 'alert_scheduler_leader_status', - 'Leader election status for schedulers (1=leader, 0=follower)', - ['service'] -) - -# Message queue health -rabbitmq_connection_status = Gauge( - 'alert_rabbitmq_connection_status', - 'RabbitMQ connection status (1=connected, 0=disconnected)', - ['service'] -) - -redis_connection_status = Gauge( - 'alert_redis_connection_status', - 
'Redis connection status (1=connected, 0=disconnected)', - ['service'] -) - -# ================================================================= -# BUSINESS METRICS -# ================================================================= - -# Alert response metrics -items_acknowledged = Counter( - 'alert_items_acknowledged_total', - 'Number of items acknowledged by users', - ['item_type', 'severity', 'service'] -) - -items_resolved = Counter( - 'alert_items_resolved_total', - 'Number of items resolved by users', - ['item_type', 'severity', 'service'] -) - -item_response_time = Histogram( - 'alert_item_response_time_seconds', - 'Time from item creation to acknowledgment', - ['item_type', 'severity'], - buckets=[60, 300, 600, 1800, 3600, 7200, 14400] -) - -# Recommendation adoption -recommendations_implemented = Counter( - 'alert_recommendations_implemented_total', - 'Number of recommendations marked as implemented', - ['type', 'service'] -) - -# Effectiveness metrics -false_positive_rate = Gauge( - 'alert_false_positive_rate', - 'Rate of false positive alerts', - ['service', 'alert_type'] -) - -# ================================================================= -# PERFORMANCE DECORATORS -# ================================================================= - -def track_duration(metric: Histogram, **labels): - """Decorator to track function execution time""" - def decorator(func): - @wraps(func) - async def async_wrapper(*args, **kwargs): - start_time = time.time() - try: - result = await func(*args, **kwargs) - metric.labels(**labels).observe(time.time() - start_time) - return result - except Exception as e: - # Track error duration too - metric.labels(**labels).observe(time.time() - start_time) - raise - - @wraps(func) - def sync_wrapper(*args, **kwargs): - start_time = time.time() - try: - result = func(*args, **kwargs) - metric.labels(**labels).observe(time.time() - start_time) - return result - except Exception as e: - metric.labels(**labels).observe(time.time() - 
start_time) - raise - - return async_wrapper if hasattr(func, '__code__') and func.__code__.co_flags & 0x80 else sync_wrapper - return decorator - -def track_errors(error_counter: Counter, **labels): - """Decorator to track errors in functions""" - def decorator(func): - @wraps(func) - async def async_wrapper(*args, **kwargs): - try: - return await func(*args, **kwargs) - except Exception as e: - error_counter.labels(error_type=type(e).__name__, **labels).inc() - raise - - @wraps(func) - def sync_wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - error_counter.labels(error_type=type(e).__name__, **labels).inc() - raise - - return async_wrapper if hasattr(func, '__code__') and func.__code__.co_flags & 0x80 else sync_wrapper - return decorator - -# ================================================================= -# UTILITY FUNCTIONS -# ================================================================= - -def record_item_published(service: str, item_type: str, severity: str, alert_type: str): - """Record that an item was published""" - items_published.labels( - service=service, - item_type=item_type, - severity=severity, - type=alert_type - ).inc() - -def record_item_processed(item_type: str, severity: str, alert_type: str, status: str): - """Record that an item was processed""" - items_processed.labels( - item_type=item_type, - severity=severity, - type=alert_type, - status=status - ).inc() - -def record_notification_sent(channel: str, item_type: str, severity: str, status: str): - """Record notification delivery""" - notifications_sent.labels( - channel=channel, - item_type=item_type, - severity=severity, - status=status - ).inc() - -def update_active_items(tenant_id: str, item_type: str, severity: str, count: int): - """Update active items gauge""" - active_items_gauge.labels( - tenant_id=tenant_id, - item_type=item_type, - severity=severity - ).set(count) - -def update_component_health(component: str, service: str, 
is_healthy: bool): - """Update component health status""" - system_component_health.labels( - component=component, - service=service - ).set(1 if is_healthy else 0) - -def update_connection_status(connection_type: str, service: str, is_connected: bool): - """Update connection status""" - if connection_type == 'rabbitmq': - rabbitmq_connection_status.labels(service=service).set(1 if is_connected else 0) - elif connection_type == 'redis': - redis_connection_status.labels(service=service).set(1 if is_connected else 0) - -# ================================================================= -# METRICS AGGREGATOR -# ================================================================= - -class AlertMetricsCollector: - """Centralized metrics collector for alert system""" - - def __init__(self, service_name: str): - self.service_name = service_name - - def record_check_performed(self, check_type: str, pattern: str): - """Record that a check was performed""" - item_checks_performed.labels( - service=self.service_name, - check_type=check_type, - pattern=pattern - ).inc() - - def record_detection_error(self, error_type: str, check_type: str): - """Record detection error""" - alert_detection_errors.labels( - service=self.service_name, - error_type=error_type, - check_type=check_type - ).inc() - - def record_duplicate_prevented(self, item_type: str, alert_type: str): - """Record prevented duplicate""" - duplicate_items_prevented.labels( - service=self.service_name, - item_type=item_type, - type=alert_type - ).inc() - - def update_leader_status(self, is_leader: bool): - """Update leader election status""" - scheduler_leader_status.labels(service=self.service_name).set(1 if is_leader else 0) - - def get_service_metrics(self) -> Dict[str, Any]: - """Get all metrics for this service""" - return { - 'service': self.service_name, - 'items_published': items_published._value._value, - 'checks_performed': item_checks_performed._value._value, - 'detection_errors': 
alert_detection_errors._value._value, - 'duplicates_prevented': duplicate_items_prevented._value._value - } - -# ================================================================= -# DASHBOARD METRICS -# ================================================================= - -def get_system_overview_metrics() -> Dict[str, Any]: - """Get overview metrics for monitoring dashboard""" - try: - return { - 'total_items_published': sum(items_published._value._value.values()), - 'total_checks_performed': sum(item_checks_performed._value._value.values()), - 'total_notifications_sent': sum(notifications_sent._value._value.values()), - 'active_sse_connections': sum(sse_active_connections._value._value.values()), - 'processing_errors': sum(processing_errors._value._value.values()), - 'delivery_failures': sum(delivery_failures._value._value.values()), - 'timestamp': time.time() - } - except Exception as e: - logger.error("Error collecting overview metrics", error=str(e)) - return {'error': str(e), 'timestamp': time.time()} - -def get_tenant_metrics(tenant_id: str) -> Dict[str, Any]: - """Get metrics for a specific tenant""" - try: - return { - 'tenant_id': tenant_id, - 'active_connections': sse_active_connections.labels(tenant_id=tenant_id)._value._value, - 'events_sent': sum([ - v for k, v in sse_events_sent._value._value.items() - if k[0] == tenant_id - ]), - 'timestamp': time.time() - } - except Exception as e: - logger.error("Error collecting tenant metrics", tenant_id=tenant_id, error=str(e)) - return {'tenant_id': tenant_id, 'error': str(e), 'timestamp': time.time()} \ No newline at end of file diff --git a/shared/monitoring/scheduler_metrics.py b/shared/monitoring/scheduler_metrics.py deleted file mode 100755 index 1b79b002..00000000 --- a/shared/monitoring/scheduler_metrics.py +++ /dev/null @@ -1,258 +0,0 @@ -# shared/monitoring/scheduler_metrics.py -""" -Scheduler Metrics - Prometheus metrics for production and procurement schedulers - -Provides comprehensive metrics for 
monitoring automated daily planning: -- Scheduler execution success/failure rates -- Tenant processing times -- Cache hit rates for forecasts -- Plan generation statistics -""" - -from prometheus_client import Counter, Histogram, Gauge, Info -import structlog - -logger = structlog.get_logger() - -# ================================================================ -# PRODUCTION SCHEDULER METRICS -# ================================================================ - -production_schedules_generated_total = Counter( - 'production_schedules_generated_total', - 'Total number of production schedules generated', - ['tenant_id', 'status'] # status: success, failure -) - -production_schedule_generation_duration_seconds = Histogram( - 'production_schedule_generation_duration_seconds', - 'Time taken to generate production schedule per tenant', - ['tenant_id'], - buckets=[1, 5, 10, 30, 60, 120, 180, 300] # seconds -) - -production_tenants_processed_total = Counter( - 'production_tenants_processed_total', - 'Total number of tenants processed by production scheduler', - ['status'] # status: success, failure, timeout -) - -production_batches_created_total = Counter( - 'production_batches_created_total', - 'Total number of production batches created', - ['tenant_id'] -) - -production_scheduler_runs_total = Counter( - 'production_scheduler_runs_total', - 'Total number of production scheduler executions', - ['trigger'] # trigger: scheduled, manual, test -) - -production_scheduler_errors_total = Counter( - 'production_scheduler_errors_total', - 'Total number of production scheduler errors', - ['error_type'] -) - -# ================================================================ -# PROCUREMENT SCHEDULER METRICS -# ================================================================ - -procurement_plans_generated_total = Counter( - 'procurement_plans_generated_total', - 'Total number of procurement plans generated', - ['tenant_id', 'status'] # status: success, failure -) - 
-procurement_plan_generation_duration_seconds = Histogram( - 'procurement_plan_generation_duration_seconds', - 'Time taken to generate procurement plan per tenant', - ['tenant_id'], - buckets=[1, 5, 10, 30, 60, 120, 180, 300] -) - -procurement_tenants_processed_total = Counter( - 'procurement_tenants_processed_total', - 'Total number of tenants processed by procurement scheduler', - ['status'] # status: success, failure, timeout -) - -procurement_requirements_created_total = Counter( - 'procurement_requirements_created_total', - 'Total number of procurement requirements created', - ['tenant_id', 'priority'] # priority: critical, high, medium, low -) - -procurement_scheduler_runs_total = Counter( - 'procurement_scheduler_runs_total', - 'Total number of procurement scheduler executions', - ['trigger'] # trigger: scheduled, manual, test -) - -procurement_plan_rejections_total = Counter( - 'procurement_plan_rejections_total', - 'Total number of procurement plans rejected', - ['tenant_id', 'auto_regenerated'] # auto_regenerated: true, false -) - -procurement_plans_by_status = Gauge( - 'procurement_plans_by_status', - 'Number of procurement plans by status', - ['tenant_id', 'status'] -) - -# ================================================================ -# FORECAST CACHING METRICS -# ================================================================ - -forecast_cache_hits_total = Counter( - 'forecast_cache_hits_total', - 'Total number of forecast cache hits', - ['tenant_id'] -) - -forecast_cache_misses_total = Counter( - 'forecast_cache_misses_total', - 'Total number of forecast cache misses', - ['tenant_id'] -) - -forecast_cache_hit_rate = Gauge( - 'forecast_cache_hit_rate', - 'Forecast cache hit rate percentage (0-100)', - ['tenant_id'] -) - -forecast_cache_entries_total = Gauge( - 'forecast_cache_entries_total', - 'Total number of entries in forecast cache', - ['cache_type'] # cache_type: single, batch -) - -forecast_cache_invalidations_total = Counter( - 
'forecast_cache_invalidations_total', - 'Total number of forecast cache invalidations', - ['tenant_id', 'reason'] # reason: model_retrain, manual, expiry -) - -# ================================================================ -# GENERAL SCHEDULER HEALTH METRICS -# ================================================================ - -scheduler_health_status = Gauge( - 'scheduler_health_status', - 'Scheduler health status (1=healthy, 0=unhealthy)', - ['service', 'scheduler_type'] # service: production, orders; scheduler_type: daily, weekly, cleanup -) - -scheduler_last_run_timestamp = Gauge( - 'scheduler_last_run_timestamp', - 'Unix timestamp of last scheduler run', - ['service', 'scheduler_type'] -) - -scheduler_next_run_timestamp = Gauge( - 'scheduler_next_run_timestamp', - 'Unix timestamp of next scheduled run', - ['service', 'scheduler_type'] -) - -tenant_processing_timeout_total = Counter( - 'tenant_processing_timeout_total', - 'Total number of tenant processing timeouts', - ['service', 'tenant_id'] # service: production, procurement -) - -# ================================================================ -# HELPER FUNCTIONS FOR METRICS -# ================================================================ - - -class SchedulerMetricsCollector: - """Helper class for collecting scheduler metrics""" - - @staticmethod - def record_production_schedule_generated(tenant_id: str, success: bool, duration_seconds: float, batches_created: int): - """Record production schedule generation""" - status = 'success' if success else 'failure' - production_schedules_generated_total.labels(tenant_id=tenant_id, status=status).inc() - production_schedule_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds) - - if success: - production_batches_created_total.labels(tenant_id=tenant_id).inc(batches_created) - - @staticmethod - def record_procurement_plan_generated(tenant_id: str, success: bool, duration_seconds: float, requirements_count: int): - """Record 
procurement plan generation""" - status = 'success' if success else 'failure' - procurement_plans_generated_total.labels(tenant_id=tenant_id, status=status).inc() - procurement_plan_generation_duration_seconds.labels(tenant_id=tenant_id).observe(duration_seconds) - - if success: - procurement_requirements_created_total.labels( - tenant_id=tenant_id, - priority='medium' # Default, should be updated with actual priority - ).inc(requirements_count) - - @staticmethod - def record_scheduler_run(service: str, trigger: str = 'scheduled'): - """Record scheduler execution""" - if service == 'production': - production_scheduler_runs_total.labels(trigger=trigger).inc() - elif service == 'procurement': - procurement_scheduler_runs_total.labels(trigger=trigger).inc() - - @staticmethod - def record_tenant_processing(service: str, status: str): - """Record tenant processing result""" - if service == 'production': - production_tenants_processed_total.labels(status=status).inc() - elif service == 'procurement': - procurement_tenants_processed_total.labels(status=status).inc() - - @staticmethod - def record_forecast_cache_lookup(tenant_id: str, hit: bool): - """Record forecast cache lookup""" - if hit: - forecast_cache_hits_total.labels(tenant_id=tenant_id).inc() - else: - forecast_cache_misses_total.labels(tenant_id=tenant_id).inc() - - @staticmethod - def update_forecast_cache_hit_rate(tenant_id: str, hit_rate_percent: float): - """Update forecast cache hit rate""" - forecast_cache_hit_rate.labels(tenant_id=tenant_id).set(hit_rate_percent) - - @staticmethod - def record_plan_rejection(tenant_id: str, auto_regenerated: bool): - """Record procurement plan rejection""" - procurement_plan_rejections_total.labels( - tenant_id=tenant_id, - auto_regenerated='true' if auto_regenerated else 'false' - ).inc() - - @staticmethod - def update_scheduler_health(service: str, scheduler_type: str, is_healthy: bool): - """Update scheduler health status""" - scheduler_health_status.labels( - 
service=service, - scheduler_type=scheduler_type - ).set(1 if is_healthy else 0) - - @staticmethod - def record_timeout(service: str, tenant_id: str): - """Record tenant processing timeout""" - tenant_processing_timeout_total.labels( - service=service, - tenant_id=tenant_id - ).inc() - - -# Global metrics collector instance -metrics_collector = SchedulerMetricsCollector() - - -def get_scheduler_metrics_collector() -> SchedulerMetricsCollector: - """Get global scheduler metrics collector""" - return metrics_collector