diff --git a/Tiltfile b/Tiltfile index 1ae26389..750e8f98 100644 --- a/Tiltfile +++ b/Tiltfile @@ -16,22 +16,142 @@ # - Gateway only rebuilds when gateway/ or shared/ code changes # ============================================================================= + # ============================================================================= # TILT CONFIGURATION # ============================================================================= +# Update settings +update_settings( + max_parallel_updates=2, # Reduce parallel updates to avoid resource exhaustion + k8s_upsert_timeout_secs=120 # Increase timeout for slower local builds +) + # Ensure we're running in the correct context allow_k8s_contexts('kind-bakery-ia-local') +# ============================================================================= +# DISK SPACE MANAGEMENT & CLEANUP CONFIGURATION +# ============================================================================= + +# Disk space management settings +disk_cleanup_enabled = True # Default to True, can be disabled with TILT_DISABLE_CLEANUP=true +if 'TILT_DISABLE_CLEANUP' in os.environ: + disk_cleanup_enabled = os.environ['TILT_DISABLE_CLEANUP'].lower() != 'true' + +disk_space_threshold_gb = '10' +if 'TILT_DISK_THRESHOLD_GB' in os.environ: + disk_space_threshold_gb = os.environ['TILT_DISK_THRESHOLD_GB'] + +disk_cleanup_frequency_minutes = '30' +if 'TILT_CLEANUP_FREQUENCY' in os.environ: + disk_cleanup_frequency_minutes = os.environ['TILT_CLEANUP_FREQUENCY'] + +print(""" +DISK SPACE MANAGEMENT CONFIGURATION +====================================== +Cleanup Enabled: {} +Free Space Threshold: {}GB +Cleanup Frequency: Every {} minutes + +To disable cleanup: export TILT_DISABLE_CLEANUP=true +To change threshold: export TILT_DISK_THRESHOLD_GB=20 +To change frequency: export TILT_CLEANUP_FREQUENCY=60 +""".format( + 'YES' if disk_cleanup_enabled else 'NO (TILT_DISABLE_CLEANUP=true)', + disk_space_threshold_gb, + disk_cleanup_frequency_minutes +)) + +# Automatic cleanup scheduler (informational only - actual scheduling done externally) +if disk_cleanup_enabled: + local_resource( + 'automatic-disk-cleanup-info', + cmd=''' + echo "Automatic disk cleanup is ENABLED" + echo "Settings:" + echo " - Threshold: ''' + disk_space_threshold_gb + ''' GB free space" + echo " - Frequency: Every ''' + disk_cleanup_frequency_minutes + ''' minutes" + echo "" + echo "Note: Actual cleanup runs via external scheduling (cron job or similar)" + echo "To run cleanup now: tilt trigger manual-disk-cleanup" + ''', + labels=['99-cleanup'], + auto_init=True, + allow_parallel=False + ) + +# Manual cleanup trigger (can be run on demand) +local_resource( + 'manual-disk-cleanup', + cmd=''' + echo "Starting manual disk cleanup..." 
+ python3 scripts/cleanup_disk_space.py --manual --verbose + ''', + labels=['99-cleanup'], + auto_init=False, + allow_parallel=False +) + +# Disk space monitoring resource +local_resource( + 'disk-space-monitor', + cmd=''' + echo "DISK SPACE MONITORING" + echo "======================================" + + # Get disk usage + df -h / | grep -v Filesystem | awk '{{print "Total: " $2 " | Used: " $3 " | Free: " $4 " | Usage: " $5}}' + + # Get Docker disk usage + echo "" + echo "DOCKER DISK USAGE:" + docker system df + + # Get Kubernetes disk usage (if available) + echo "" + echo "KUBERNETES DISK USAGE:" + kubectl get pvc -n bakery-ia --no-headers 2>/dev/null | awk '{{print "PVC: " $1 " | Status: " $2 " | Capacity: " $3 " | Used: " $4}}' || echo " Kubernetes PVCs not available" + + echo "" + echo "Cleanup Status:" + if [ "{disk_cleanup_enabled}" = "True" ]; then + echo " Automatic cleanup: ENABLED (every {disk_cleanup_frequency_minutes} minutes)" + echo " Threshold: {disk_space_threshold_gb}GB free space" + else + echo " Automatic cleanup: DISABLED" + echo " To enable: unset TILT_DISABLE_CLEANUP or set TILT_DISABLE_CLEANUP=false" + fi + + echo "" + echo "Manual cleanup commands:" + echo " tilt trigger manual-disk-cleanup # Run cleanup now" + echo " docker system prune -a # Manual Docker cleanup" + echo " kubectl delete jobs --all # Clean up completed jobs" + ''', + labels=['99-cleanup'], + auto_init=False, + allow_parallel=False +) + +# ============================================================================= +# DOCKER REGISTRY CONFIGURATION +# ============================================================================= + # Docker registry configuration # Set USE_DOCKERHUB=true environment variable to push images to Docker Hub # Otherwise, uses local registry for faster builds and deployments -use_dockerhub = os.getenv('USE_DOCKERHUB', 'false').lower() == 'true' -dockerhub_username = os.getenv('DOCKERHUB_USERNAME', 'uals') +use_dockerhub = False # Default to False +if 'USE_DOCKERHUB' in os.environ: + use_dockerhub = os.environ['USE_DOCKERHUB'].lower() == 'true' + +dockerhub_username = 'uals' # Default username +if 'DOCKERHUB_USERNAME' in os.environ: + dockerhub_username = os.environ['DOCKERHUB_USERNAME'] if use_dockerhub: print(""" - ๐Ÿณ DOCKER HUB MODE ENABLED + DOCKER HUB MODE ENABLED Images will be pushed to Docker Hub: docker.io/%s Make sure you're logged in: docker login To disable: unset USE_DOCKERHUB or set USE_DOCKERHUB=false @@ -39,7 +159,7 @@ if use_dockerhub: default_registry('docker.io/%s' % dockerhub_username) else: print(""" - ๐Ÿ  LOCAL REGISTRY MODE + LOCAL REGISTRY MODE Using local registry for faster builds: localhost:5001 This registry is created by kubernetes_restart.sh script To use Docker Hub: export USE_DOCKERHUB=true @@ -52,20 +172,21 @@ else: print(""" ====================================== -๐Ÿ” Bakery IA Secure Development Mode +Bakery IA Secure Development Mode ====================================== Security Features: - โœ… TLS encryption for PostgreSQL and Redis - โœ… Strong 32-character passwords - โœ… PersistentVolumeClaims (no data loss) - โœ… pgcrypto extension for encryption - โœ… PostgreSQL audit logging + TLS encryption for PostgreSQL and Redis + Strong 32-character passwords + PersistentVolumeClaims (no data loss) + Column encryption: pgcrypto extension + Audit logging: PostgreSQL query logging + Object storage: MinIO with TLS for ML models Monitoring: - ๐Ÿ“Š Service metrics available at /metrics endpoints - ๐Ÿ” Telemetry ready (traces, metrics, logs) - 
โ„น๏ธ SigNoz deployment optional for local dev (see signoz-info resource) + Service metrics available at /metrics endpoints + Telemetry ready (traces, metrics, logs) + SigNoz deployment optional for local dev (see signoz-info resource) Applying security configurations... """) @@ -74,7 +195,7 @@ Applying security configurations... local_resource( 'dockerhub-secret', cmd=''' - echo "๐Ÿณ Setting up Docker Hub image pull secret..." + echo "Setting up Docker Hub image pull secret..." # Check if Docker Hub credentials are available if [ -n "$DOCKERHUB_USERNAME" ] && [ -n "$DOCKERHUB_PASSWORD" ]; then @@ -84,7 +205,7 @@ local_resource( echo " Attempting to use Docker CLI credentials..." ./infrastructure/kubernetes/create-dockerhub-secret.sh else - echo " โš ๏ธ Docker Hub credentials not found" + echo " Docker Hub credentials not found" echo " To enable automatic Docker Hub authentication:" echo " 1. Run 'docker login', OR" echo " 2. Set environment variables:" @@ -103,13 +224,13 @@ local_resource( local_resource( 'security-setup', cmd=''' - echo "๐Ÿ“ฆ Applying security secrets and configurations..." + echo "Applying security secrets and configurations..." kubectl apply -f infrastructure/kubernetes/base/secrets.yaml kubectl apply -f infrastructure/kubernetes/base/secrets/postgres-tls-secret.yaml kubectl apply -f infrastructure/kubernetes/base/secrets/redis-tls-secret.yaml kubectl apply -f infrastructure/kubernetes/base/configs/postgres-init-config.yaml kubectl apply -f infrastructure/kubernetes/base/configmaps/postgres-logging-config.yaml - echo "โœ… Security configurations applied" + echo "Security configurations applied" ''', resource_deps=['dockerhub-secret'], labels=['00-security'], @@ -120,7 +241,7 @@ local_resource( local_resource( 'verify-tls', cmd=''' - echo "๐Ÿ” Verifying TLS configuration..." + echo "Verifying TLS configuration..." sleep 5 # Wait for pods to be ready # Check if auth-db pod exists and has TLS certs @@ -129,8 +250,8 @@ local_resource( if [ -n "$AUTH_POD" ]; then echo " Checking PostgreSQL TLS certificates..." kubectl exec -n bakery-ia "$AUTH_POD" -- ls -la /tls/ 2>/dev/null && \ - echo " โœ… PostgreSQL TLS certificates mounted" || \ - echo " โš ๏ธ PostgreSQL TLS certificates not found (pods may still be starting)" + echo " PostgreSQL TLS certificates mounted" || \ + echo " PostgreSQL TLS certificates not found (pods may still be starting)" fi # Check if redis pod exists and has TLS certs @@ -139,15 +260,14 @@ local_resource( if [ -n "$REDIS_POD" ]; then echo " Checking Redis TLS certificates..." kubectl exec -n bakery-ia "$REDIS_POD" -- ls -la /tls/ 2>/dev/null && \ - echo " โœ… Redis TLS certificates mounted" || \ - echo " โš ๏ธ Redis TLS certificates not found (pods may still be starting)" + echo " Redis TLS certificates mounted" || \ + echo " Redis TLS certificates not found (pods may still be starting)" fi - echo "โœ… TLS verification complete" + echo "TLS verification complete" ''', resource_deps=['auth-db', 'redis'], auto_init=True, - trigger_mode=TRIGGER_MODE_MANUAL, labels=['00-security'] ) @@ -155,15 +275,14 @@ local_resource( local_resource( 'verify-pvcs', cmd=''' - echo "๐Ÿ” Verifying PersistentVolumeClaims..." - kubectl get pvc -n bakery-ia | grep -E "NAME|db-pvc" || echo " โš ๏ธ PVCs not yet bound" + echo "Verifying PersistentVolumeClaims..." 
+ kubectl get pvc -n bakery-ia | grep -E "NAME|db-pvc" || echo " PVCs not yet bound" PVC_COUNT=$(kubectl get pvc -n bakery-ia -o json | jq '.items | length') echo " Found $PVC_COUNT PVCs" - echo "โœ… PVC verification complete" + echo "PVC verification complete" ''', resource_deps=['auth-db'], auto_init=True, - trigger_mode=TRIGGER_MODE_MANUAL, labels=['00-security'] ) @@ -171,11 +290,11 @@ local_resource( local_resource( 'cert-manager-install', cmd=''' - echo "๐Ÿ“ฆ Installing cert-manager..." + echo "Installing cert-manager..." # Check if cert-manager CRDs already exist if kubectl get crd certificates.cert-manager.io >/dev/null 2>&1; then - echo " โœ… cert-manager CRDs already installed" + echo " cert-manager CRDs already installed" else echo " Installing cert-manager v1.13.2..." kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml @@ -184,10 +303,10 @@ local_resource( kubectl wait --for=condition=available --timeout=120s deployment/cert-manager -n cert-manager kubectl wait --for=condition=available --timeout=120s deployment/cert-manager-webhook -n cert-manager - echo " โœ… cert-manager installed and ready" + echo " cert-manager installed and ready" fi - echo "โœ… cert-manager verification complete" + echo "cert-manager verification complete" ''', labels=['00-security'], auto_init=True @@ -265,19 +384,21 @@ def build_python_service(service_name, service_path): # ============================================================================= # Frontend (React + Vite) -frontend_debug_env = os.getenv('FRONTEND_DEBUG', 'false') +frontend_debug_env = 'false' # Default to false +if 'FRONTEND_DEBUG' in os.environ: + frontend_debug_env = os.environ['FRONTEND_DEBUG'] frontend_debug = frontend_debug_env.lower() == 'true' if frontend_debug: print(""" - ๐Ÿ› FRONTEND DEBUG MODE ENABLED + FRONTEND DEBUG MODE ENABLED Building frontend with NO minification for easier debugging. Full React error messages will be displayed. To disable: unset FRONTEND_DEBUG or set FRONTEND_DEBUG=false """) else: print(""" - ๐Ÿ“ฆ FRONTEND PRODUCTION MODE + FRONTEND PRODUCTION MODE Building frontend with minification for optimized performance. To enable debug mode: export FRONTEND_DEBUG=true """) @@ -384,6 +505,10 @@ k8s_resource('redis', resource_deps=['security-setup'], labels=['01-infrastructu k8s_resource('rabbitmq', labels=['01-infrastructure']) k8s_resource('nominatim', labels=['01-infrastructure']) +# MinIO Storage +k8s_resource('minio', resource_deps=['security-setup'], labels=['01-infrastructure']) +k8s_resource('minio-bucket-init', resource_deps=['minio'], labels=['01-infrastructure']) + # ============================================================================= # MONITORING RESOURCES - SigNoz (Unified Observability) # ============================================================================= @@ -392,25 +517,25 @@ k8s_resource('nominatim', labels=['01-infrastructure']) local_resource( 'signoz-deploy', cmd=''' - echo "๐Ÿ“Š Deploying SigNoz Monitoring Stack..." + echo "Deploying SigNoz Monitoring Stack..." echo "" # Ensure Docker Hub secret exists in bakery-ia namespace - echo "๐Ÿ” Ensuring Docker Hub secret exists in bakery-ia namespace..." + echo "Ensuring Docker Hub secret exists in bakery-ia namespace..." if ! kubectl get secret dockerhub-creds -n bakery-ia &>/dev/null; then - echo " โš ๏ธ Docker Hub secret not found, attempting to create..." + echo " Docker Hub secret not found, attempting to create..." 
./infrastructure/kubernetes/create-dockerhub-secret.sh || echo " Continuing without Docker Hub authentication..." else - echo " โœ… Docker Hub secret exists" + echo " Docker Hub secret exists" fi echo "" # Check if SigNoz is already deployed if helm list -n bakery-ia | grep -q signoz; then - echo "โœ… SigNoz already deployed, checking status..." + echo "SigNoz already deployed, checking status..." helm status signoz -n bakery-ia else - echo "๐Ÿš€ Installing SigNoz..." + echo "Installing SigNoz..." # Add SigNoz Helm repository if not already added helm repo add signoz https://charts.signoz.io 2>/dev/null || true @@ -424,25 +549,23 @@ local_resource( --wait echo "" - echo "โœ… SigNoz deployment completed" + echo "SigNoz deployment completed" fi echo "" - echo "๐Ÿ“ˆ SigNoz Access Information:" + echo "SigNoz Access Information:" echo " URL: https://monitoring.bakery-ia.local" echo " Username: admin" echo " Password: admin" echo "" - echo "๐Ÿ”ง OpenTelemetry Collector Endpoints:" + echo "OpenTelemetry Collector Endpoints:" echo " gRPC: localhost:4317" echo " HTTP: localhost:4318" echo "" - echo "๐Ÿ’ก To check pod status: kubectl get pods -n signoz" + echo "To check pod status: kubectl get pods -n signoz" ''', labels=['05-monitoring'], auto_init=False, - trigger_mode=TRIGGER_MODE_MANUAL, - allow_parallel=False ) # Track SigNoz pods in Tilt UI using workload tracking @@ -450,7 +573,7 @@ local_resource( local_resource( 'signoz-status', cmd=''' - echo "๐Ÿ“Š SigNoz Status Check" + echo "SigNoz Status Check" echo "" # Check pod status @@ -470,19 +593,17 @@ local_resource( echo "Pod Status: $READY_PODS/$TOTAL_PODS ready" if [ "$READY_PODS" -eq "$TOTAL_PODS" ]; then - echo "โœ… All SigNoz pods are running!" + echo "All SigNoz pods are running!" echo "" echo "Access SigNoz at: https://monitoring.bakery-ia.local" echo "Credentials: admin / admin" else - echo "โณ Waiting for pods to become ready..." + echo "Waiting for pods to become ready..." fi fi ''', labels=['05-monitoring'], - resource_deps=['signoz-deploy'], auto_init=False, - trigger_mode=TRIGGER_MODE_MANUAL ) # Optional exporters (in monitoring namespace) - DISABLED since using SigNoz @@ -566,7 +687,6 @@ k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['08-data-init']) k8s_resource('nominatim-init', labels=['08-data-init']) -# ============================================================================= # ============================================================================= # APPLICATION SERVICES # ============================================================================= @@ -618,15 +738,9 @@ k8s_resource('demo-session-cleanup', resource_deps=['demo-session-service'], lab k8s_resource('external-data-rotation', resource_deps=['external-service'], labels=['16-cronjobs']) # ============================================================================= -# TILT CONFIGURATION +# WATCH SETTINGS # ============================================================================= -# Update settings -update_settings( - max_parallel_updates=2, # Reduce parallel updates to avoid resource exhaustion - k8s_upsert_timeout_secs=120 # Increase timeout for slower local builds -) - # Watch settings watch_settings( ignore=[ @@ -665,18 +779,19 @@ watch_settings( # ============================================================================= print(""" -โœ… Security setup complete! +Security setup complete! 
Database Security Features Active: - ๐Ÿ” TLS encryption: PostgreSQL and Redis - ๐Ÿ”‘ Strong passwords: 32-character cryptographic - ๐Ÿ’พ Persistent storage: PVCs for all databases - ๐Ÿ”’ Column encryption: pgcrypto extension - ๐Ÿ“‹ Audit logging: PostgreSQL query logging + TLS encryption: PostgreSQL and Redis + Strong passwords: 32-character cryptographic + Persistent storage: PVCs for all databases + Column encryption: pgcrypto extension + Audit logging: PostgreSQL query logging Internal Schedulers Active: - โฐ Alert Priority Recalculation: Hourly @ :15 (alert-processor) - โฐ Usage Tracking: Daily @ 2:00 AM UTC (tenant-service) + Alert Priority Recalculation: Hourly @ :15 (alert-processor) + Usage Tracking: Daily @ 2:00 AM UTC (tenant-service) + Disk Cleanup: Every {disk_cleanup_frequency_minutes} minutes (threshold: {disk_space_threshold_gb}GB) Access your application: Main Application: https://bakery-ia.local @@ -708,11 +823,11 @@ Documentation: docs/DATABASE_SECURITY_ANALYSIS_REPORT.md Build Optimization Active: - โœ… Services only rebuild when their code changes - โœ… Shared folder changes trigger ALL services (as expected) - โœ… Reduces unnecessary rebuilds and disk usage - ๐Ÿ’ก Edit service code: only that service rebuilds - ๐Ÿ’ก Edit shared/ code: all services rebuild (required) + Services only rebuild when their code changes + Shared folder changes trigger ALL services (as expected) + Reduces unnecessary rebuilds and disk usage + Edit service code: only that service rebuilds + Edit shared/ code: all services rebuild (required) Useful Commands: # Work on specific services only @@ -730,4 +845,4 @@ DNS Configuration: # 127.0.0.1 monitoring.bakery-ia.local ====================================== -""") +""") \ No newline at end of file diff --git a/docs/MINIO_CERTIFICATE_GENERATION_GUIDE.md b/docs/MINIO_CERTIFICATE_GENERATION_GUIDE.md new file mode 100644 index 00000000..04d01f04 --- /dev/null +++ b/docs/MINIO_CERTIFICATE_GENERATION_GUIDE.md @@ -0,0 +1,154 @@ +# MinIO Certificate Generation Guide + +## Quick Start + +To generate MinIO certificates with the correct format: + +```bash +# Generate certificates +./infrastructure/tls/generate-minio-certificates.sh + +# Update Kubernetes secret +kubectl delete secret -n bakery-ia minio-tls +kubectl apply -f infrastructure/kubernetes/base/secrets/minio-tls-secret.yaml + +# Restart MinIO +kubectl rollout restart deployment -n bakery-ia minio +``` + +## Key Requirements + +### Private Key Format +โœ… **Required**: Traditional RSA format (`BEGIN RSA PRIVATE KEY`) +โŒ **Problematic**: PKCS#8 format (`BEGIN PRIVATE KEY`) + +### Certificate Files +- `minio-cert.pem` - Server certificate +- `minio-key.pem` - Private key (must be traditional RSA format) +- `ca-cert.pem` - CA certificate + +## Verification + +### Check Private Key Format +```bash +head -1 infrastructure/tls/minio/minio-key.pem +# Should output: -----BEGIN RSA PRIVATE KEY----- +``` + +### Verify Certificate Chain +```bash +openssl verify -CAfile infrastructure/tls/ca/ca-cert.pem \ + infrastructure/tls/minio/minio-cert.pem +``` + +### Check Certificate Details +```bash +openssl x509 -in infrastructure/tls/minio/minio-cert.pem -noout \ + -subject -issuer -dates +``` + +## Troubleshooting + +### Error: "The private key contains additional data" +**Cause**: Private key is in PKCS#8 format instead of traditional RSA format + +**Solution**: Convert the key: +```bash +openssl rsa -in minio-key.pem -traditional -out minio-key-fixed.pem +mv minio-key-fixed.pem minio-key.pem +``` + +### 
Error: "Unable to parse private key" +**Cause**: Certificate/key mismatch or corrupted files + +**Solution**: Regenerate certificates and verify: +```bash +# Check modulus of certificate and key (should match) +openssl x509 -noout -modulus -in minio-cert.pem | openssl md5 +openssl rsa -noout -modulus -in minio-key.pem | openssl md5 +``` + +## Certificate Rotation + +### Step-by-Step Process + +1. **Generate new certificates** + ```bash + ./infrastructure/tls/generate-minio-certificates.sh + ``` + +2. **Update base64 values in secret** + ```bash + # Update infrastructure/kubernetes/base/secrets/minio-tls-secret.yaml + # with new base64 encoded certificate values + ``` + +3. **Apply updated secret** + ```bash + kubectl delete secret -n bakery-ia minio-tls + kubectl apply -f infrastructure/kubernetes/base/secrets/minio-tls-secret.yaml + ``` + +4. **Restart MinIO pods** + ```bash + kubectl rollout restart deployment -n bakery-ia minio + ``` + +5. **Verify** + ```bash + kubectl logs -n bakery-ia -l app.kubernetes.io/name=minio --tail=5 + # Should show: API: https://minio.bakery-ia.svc.cluster.local:9000 + ``` + +## Technical Details + +### Certificate Generation Process + +1. **Generate private key** (RSA 4096-bit) +2. **Convert to traditional RSA format** (critical for MinIO) +3. **Create CSR** with proper SANs +4. **Sign with CA** (valid for 3 years) +5. **Set permissions** (600 for key, 644 for certs) + +### SANs (Subject Alternative Names) + +The certificate includes these SANs for comprehensive coverage: +- `minio.bakery-ia.svc.cluster.local` (primary) +- `minio.bakery-ia` +- `minio-console.bakery-ia.svc.cluster.local` +- `minio-console.bakery-ia` +- `minio` +- `minio-console` +- `localhost` +- `127.0.0.1` + +### Secret Structure + +The Kubernetes secret uses the standardized Opaque format: + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: minio-tls + namespace: bakery-ia +type: Opaque +data: + ca-cert.pem: + minio-cert.pem: + minio-key.pem: +``` + +## Best Practices + +1. **Always verify private key format** before applying +2. **Test certificates** with `openssl verify` before deployment +3. **Use the generation script** to ensure consistency +4. **Document certificate expiration dates** for rotation planning +5. 
**Monitor MinIO logs** after certificate updates + +## Related Documentation + +- [MinIO TLS Fix Summary](MINIO_TLS_FIX_SUMMARY.md) +- [Kubernetes TLS Secrets Guide](../kubernetes-tls-guide.md) +- [Certificate Management Best Practices](../certificate-management.md) \ No newline at end of file diff --git a/frontend/nginx.conf b/frontend/nginx.conf index 2c5891ea..241b550a 100644 --- a/frontend/nginx.conf +++ b/frontend/nginx.conf @@ -34,20 +34,47 @@ server { # Note: API routing is handled by ingress, not by this nginx # The frontend makes requests to /api which are routed by the ingress controller - # Static assets with aggressive caching (including source maps for debugging) - location ~* ^/assets/.*\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot|map)$ { - expires 1y; - add_header Cache-Control "public, immutable"; - add_header Vary Accept-Encoding; + # Source map files - serve with proper CORS headers and content type + # Note: These are typically only needed in development, but served in production for error reporting + location ~* ^/assets/.*\.map$ { + # Short cache time to avoid mismatches with JS files + expires 1m; + add_header Cache-Control "public, must-revalidate"; + add_header Access-Control-Allow-Origin "*"; + add_header Access-Control-Allow-Methods "GET"; + add_header Access-Control-Allow-Headers "Content-Type"; + add_header Content-Type "application/json"; + # Disable access logging for source maps as they're requested frequently access_log off; try_files $uri =404; } - # Also handle JS and CSS files anywhere in the structure (for dynamic imports) - location ~* \.(js|css)$ { + # Static assets with appropriate caching + # Note: JS/CSS files have content hashes for cache busting, but use shorter cache times to handle deployment issues + location ~* ^/assets/.*\.(js|css)$ { + expires 1h; + add_header Cache-Control "public"; + add_header Vary Accept-Encoding; + add_header Access-Control-Allow-Origin "*"; + access_log off; + try_files $uri =404; + } + + # Static assets that don't change often (images, fonts) can have longer cache times + location ~* ^/assets/.*\.(png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { expires 1y; add_header Cache-Control "public, immutable"; add_header Vary Accept-Encoding; + add_header Access-Control-Allow-Origin "*"; + access_log off; + try_files $uri =404; + } + + # Handle JS and CSS files anywhere in the structure (for dynamic imports) with shorter cache + location ~* \.(js|css)$ { + expires 1h; + add_header Cache-Control "public"; + add_header Vary Accept-Encoding; access_log off; try_files $uri =404; } diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 6a4a92dc..b406c226 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -9,6 +9,13 @@ "version": "2.0.0", "dependencies": { "@hookform/resolvers": "^3.3.2", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/exporter-metrics-otlp-http": "^0.210.0", + "@opentelemetry/exporter-trace-otlp-http": "^0.210.0", + "@opentelemetry/resources": "^2.4.0", + "@opentelemetry/sdk-metrics": "^2.4.0", + "@opentelemetry/sdk-trace-web": "^2.4.0", + "@opentelemetry/semantic-conventions": "^1.39.0", "@radix-ui/react-accordion": "^1.1.2", "@radix-ui/react-checkbox": "^1.0.4", "@radix-ui/react-dialog": "^1.0.5", @@ -2976,6 +2983,209 @@ "dev": true, "license": "MIT" }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": 
"sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "license": "Apache-2.0", + "peer": true, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/api-logs": { + "version": "0.210.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.210.0.tgz", + "integrity": "sha512-CMtLxp+lYDriveZejpBND/2TmadrrhUfChyxzmkFtHaMDdSKfP59MAYyA0ICBvEBdm3iXwLcaj/8Ic/pnGw9Yg==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.3.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/core": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.4.0.tgz", + "integrity": "sha512-KtcyFHssTn5ZgDu6SXmUznS80OFs/wN7y6MyFRRcKU6TOw8hNcGxKvt8hsdaLJfhzUszNSjURetq5Qpkad14Gw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/exporter-metrics-otlp-http": { + "version": "0.210.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-metrics-otlp-http/-/exporter-metrics-otlp-http-0.210.0.tgz", + "integrity": "sha512-JpLThG8Hh8A/Jzdzw9i4Ftu+EzvLaX/LouN+mOOHmadL0iror0Qsi3QWzucXeiUsDDsiYgjfKyi09e6sltytgA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/otlp-exporter-base": "0.210.0", + "@opentelemetry/otlp-transformer": "0.210.0", + "@opentelemetry/resources": "2.4.0", + "@opentelemetry/sdk-metrics": "2.4.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/exporter-trace-otlp-http": { + "version": "0.210.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-trace-otlp-http/-/exporter-trace-otlp-http-0.210.0.tgz", + "integrity": "sha512-9JkyaCl70anEtuKZdoCQmjDuz1/paEixY/DWfsvHt7PGKq3t8/nQ/6/xwxHjG+SkPAUbo1Iq4h7STe7Pk2bc5A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/otlp-exporter-base": "0.210.0", + "@opentelemetry/otlp-transformer": "0.210.0", + "@opentelemetry/resources": "2.4.0", + "@opentelemetry/sdk-trace-base": "2.4.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/otlp-exporter-base": { + "version": "0.210.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-exporter-base/-/otlp-exporter-base-0.210.0.tgz", + "integrity": "sha512-uk78DcZoBNHIm26h0oXc8Pizh4KDJ/y04N5k/UaI9J7xR7mL8QcMcYPQG9xxN7m8qotXOMDRW6qTAyptav4+3w==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/otlp-transformer": "0.210.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/otlp-transformer": { + "version": "0.210.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-transformer/-/otlp-transformer-0.210.0.tgz", + "integrity": "sha512-nkHBJVSJGOwkRZl+BFIr7gikA93/U8XkL2EWaiDbj3DVjmTEZQpegIKk0lT8oqQYfP8FC6zWNjuTfkaBVqa0ZQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.210.0", + "@opentelemetry/core": "2.4.0", + "@opentelemetry/resources": "2.4.0", + "@opentelemetry/sdk-logs": "0.210.0", + 
"@opentelemetry/sdk-metrics": "2.4.0", + "@opentelemetry/sdk-trace-base": "2.4.0", + "protobufjs": "8.0.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/resources": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.4.0.tgz", + "integrity": "sha512-RWvGLj2lMDZd7M/5tjkI/2VHMpXebLgPKvBUd9LRasEWR2xAynDwEYZuLvY9P2NGG73HF07jbbgWX2C9oavcQg==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-logs": { + "version": "0.210.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-logs/-/sdk-logs-0.210.0.tgz", + "integrity": "sha512-YuaL92Dpyk/Kc1o4e9XiaWWwiC0aBFN+4oy+6A9TP4UNJmRymPMEX10r6EMMFMD7V0hktiSig9cwWo59peeLCQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.210.0", + "@opentelemetry/core": "2.4.0", + "@opentelemetry/resources": "2.4.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.4.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-metrics": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.4.0.tgz", + "integrity": "sha512-qSbfq9mXbLMqmPEjijl32f3ZEmiHekebRggPdPjhHI6t1CsAQOR2Aw/SuTDftk3/l2aaPHpwP3xM2DkgBA1ANw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/resources": "2.4.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.9.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-base": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.4.0.tgz", + "integrity": "sha512-WH0xXkz/OHORDLKqaxcUZS0X+t1s7gGlumr2ebiEgNZQl2b0upK2cdoD0tatf7l8iP74woGJ/Kmxe82jdvcWRw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/resources": "2.4.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-web": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-web/-/sdk-trace-web-2.4.0.tgz", + "integrity": "sha512-1FYg7qnrgTugPev51SehxCp0v9J4P97MJn2MaXQ8QK//psfyLDorKAAC3LmSIhq7XaC726WSZ/Wm69r8NdjIsA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.4.0", + "@opentelemetry/sdk-trace-base": "2.4.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/semantic-conventions": { + "version": "1.39.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.39.0.tgz", + "integrity": "sha512-R5R9tb2AXs2IRLNKLBJDynhkfmx7mX0vi8NkhZb3gUkPWHn6HXk5J8iQ/dql0U3ApfWym4kXXmBDRGO+oeOfjg==", + "license": "Apache-2.0", + "engines": { + "node": ">=14" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -3010,6 +3220,70 @@ "dev": true, "license": 
"MIT" }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==", + "license": "BSD-3-Clause" + }, "node_modules/@radix-ui/number": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/@radix-ui/number/-/number-1.1.1.tgz", @@ -6577,7 +6851,6 @@ "version": "20.19.17", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.17.tgz", "integrity": "sha512-gfehUI8N1z92kygssiuWvLiwcbOB3IRktR6hTDgJlXMYh5OvkPSRmgfoBUmfZt+vhwJtX7v1Yw4KvvAf7c5QKQ==", - "dev": true, "license": "MIT", "dependencies": { "undici-types": "~6.21.0" @@ -11721,6 +11994,12 @@ "dev": true, "license": "MIT" }, + "node_modules/long": { + "version": 
"5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==", + "license": "Apache-2.0" + }, "node_modules/loose-envify": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", @@ -13119,6 +13398,30 @@ "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==", "license": "MIT" }, + "node_modules/protobufjs": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-8.0.0.tgz", + "integrity": "sha512-jx6+sE9h/UryaCZhsJWbJtTEy47yXoGNYI4z8ZaRncM0zBKeRqjO2JEcOUYwrYGb1WLhXM1FfMzW3annvFv0rw==", + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -15451,7 +15754,6 @@ "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", - "dev": true, "license": "MIT" }, "node_modules/unicode-canonical-property-names-ecmascript": { diff --git a/frontend/package.json b/frontend/package.json index a82e3776..069b823a 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -30,6 +30,13 @@ }, "dependencies": { "@hookform/resolvers": "^3.3.2", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/exporter-metrics-otlp-http": "^0.210.0", + "@opentelemetry/exporter-trace-otlp-http": "^0.210.0", + "@opentelemetry/resources": "^2.4.0", + "@opentelemetry/sdk-metrics": "^2.4.0", + "@opentelemetry/sdk-trace-web": "^2.4.0", + "@opentelemetry/semantic-conventions": "^1.39.0", "@radix-ui/react-accordion": "^1.1.2", "@radix-ui/react-checkbox": "^1.0.4", "@radix-ui/react-dialog": "^1.0.5", diff --git a/frontend/src/components/AnalyticsTestComponent.tsx b/frontend/src/components/AnalyticsTestComponent.tsx new file mode 100644 index 00000000..8cee8996 --- /dev/null +++ b/frontend/src/components/AnalyticsTestComponent.tsx @@ -0,0 +1,66 @@ +import React, { useState } from 'react'; +import { trackUserAction, trackUserLocation } from '../utils/analytics'; + +const AnalyticsTestComponent: React.FC = () => { + const [locationStatus, setLocationStatus] = useState('Not requested'); + const [actionStatus, setActionStatus] = useState(''); + + const handleTrackLocation = async () => { + try { + setLocationStatus('Requesting...'); + await trackUserLocation(); + setLocationStatus('Location tracked successfully!'); + } catch (error) { + setLocationStatus('Error tracking location'); + console.error('Location tracking error:', error); + } + }; + + const handleTrackAction = () => { + const actionName = `button_click_${Date.now()}`; + trackUserAction(actionName, { + component: 'AnalyticsTestComponent', + timestamp: new Date().toISOString() + }); + setActionStatus(`Action "${actionName}" tracked`); + }; + + return ( +
+    <div>
+      <h2>Analytics Test Component</h2>
+
+      <div>
+        <button onClick={handleTrackLocation}>Track User Location</button>
+        <span>{locationStatus}</span>
+      </div>
+
+      <div>
+        <button onClick={handleTrackAction}>Track User Action</button>
+        <span>{actionStatus}</span>
+      </div>
+
+      <div>
+        <h3>Expected Behavior:</h3>
+        <ul>
+          <li>Page views are automatically tracked when this component loads</li>
+          <li>Session information is captured on initial load</li>
+          <li>Browser and device info is collected automatically</li>
+          <li>Clicking buttons will generate user action traces</li>
+          <li>Location tracking requires user permission</li>
+        </ul>
+      </div>
+    </div>
+ ); +}; + +export default AnalyticsTestComponent; \ No newline at end of file diff --git a/frontend/src/config/runtime.ts b/frontend/src/config/runtime.ts index 8facbae7..c1ec1263 100644 --- a/frontend/src/config/runtime.ts +++ b/frontend/src/config/runtime.ts @@ -5,6 +5,9 @@ interface RuntimeConfig { VITE_API_URL: string; VITE_APP_TITLE: string; VITE_APP_VERSION: string; + VITE_OTEL_TRACES_ENDPOINT?: string; + VITE_OTEL_METRICS_ENDPOINT?: string; + VITE_OTEL_ENABLED?: string; } declare global { @@ -27,6 +30,9 @@ function getRuntimeConfig(): RuntimeConfig { VITE_API_URL: import.meta.env.VITE_API_URL || 'http://localhost:8000', VITE_APP_TITLE: import.meta.env.VITE_APP_TITLE || 'PanIA Dashboard', VITE_APP_VERSION: import.meta.env.VITE_APP_VERSION || '1.0.0', + VITE_OTEL_TRACES_ENDPOINT: import.meta.env.VITE_OTEL_TRACES_ENDPOINT || '/api/v1/telemetry/v1/traces', + VITE_OTEL_METRICS_ENDPOINT: import.meta.env.VITE_OTEL_METRICS_ENDPOINT || '/api/v1/telemetry/v1/metrics', + VITE_OTEL_ENABLED: import.meta.env.VITE_OTEL_ENABLED || 'true', }; } @@ -52,6 +58,21 @@ export function isKubernetesEnvironment(): boolean { return typeof window !== 'undefined' && !!window.__RUNTIME_CONFIG__; } +// Helper to check if OpenTelemetry is enabled +export function isOpenTelemetryEnabled(): boolean { + return config.VITE_OTEL_ENABLED?.toLowerCase() !== 'false'; +} + +// Helper to get OpenTelemetry traces endpoint +export function getOtelTracesEndpoint(): string { + return config.VITE_OTEL_TRACES_ENDPOINT || '/api/v1/telemetry/v1/traces'; +} + +// Helper to get OpenTelemetry metrics endpoint +export function getOtelMetricsEndpoint(): string { + return config.VITE_OTEL_METRICS_ENDPOINT || '/api/v1/telemetry/v1/metrics'; +} + // Debug function to log current configuration export function logConfig(): void { console.log('Current configuration:', { diff --git a/frontend/src/hooks/useAnalytics.ts b/frontend/src/hooks/useAnalytics.ts new file mode 100644 index 00000000..b28d10f7 --- /dev/null +++ b/frontend/src/hooks/useAnalytics.ts @@ -0,0 +1,33 @@ +import { + trackPageView, + trackUserAction, + trackUserLocation, + trackSession, + getCurrentUserId, + isAnalyticsEnabled +} from '../utils/analytics'; + +/** + * React Hook for analytics + * + * NOTE: Page view tracking is handled globally by initializeAnalytics() in main.tsx. + * This hook only exposes tracking functions for use in components. + * Do NOT add automatic page tracking here to avoid duplicate events. + */ +export const useAnalytics = () => { + return { + // Manual page view tracking (use only for custom page events, not navigation) + trackPageView, + // Track user actions (button clicks, form submissions, etc.) 
+ trackUserAction, + // Track user location (requires consent) + trackUserLocation, + // Track session (typically called once at app init) + trackSession, + // Get current user ID + getCurrentUserId, + // Check if analytics are enabled + isAnalyticsEnabled + }; +}; + diff --git a/frontend/src/main.tsx b/frontend/src/main.tsx index 5ca332e9..9fb801f8 100644 --- a/frontend/src/main.tsx +++ b/frontend/src/main.tsx @@ -7,6 +7,92 @@ import './styles/animations.css'; import './styles/themes/light.css'; import './styles/themes/dark.css'; +// OpenTelemetry Web SDK initialization +import { WebTracerProvider } from '@opentelemetry/sdk-trace-web'; +import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base'; +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; +import { resourceFromAttributes } from '@opentelemetry/resources'; +import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from '@opentelemetry/semantic-conventions'; +import { MeterProvider, PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics'; +import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http'; +import { metrics } from '@opentelemetry/api'; + +// Import analytics utilities +import { initializeAnalytics } from './utils/analytics'; + +// Import configuration +import { isOpenTelemetryEnabled, getOtelTracesEndpoint, getOtelMetricsEndpoint } from './config/runtime'; + +// Store cleanup function for proper teardown +let analyticsCleanup: (() => void) | null = null; + +// Initialize OpenTelemetry +const initOpenTelemetry = () => { + // Check if OpenTelemetry is enabled in configuration + if (!isOpenTelemetryEnabled()) { + console.log('OpenTelemetry disabled by configuration'); + return; + } + + try { + // Create resource with service information using non-deprecated attributes + const resource = resourceFromAttributes({ + [ATTR_SERVICE_NAME]: 'bakery-frontend', + [ATTR_SERVICE_VERSION]: '1.0.0' + }); + + // Initialize tracer with span processor + const traceExporter = new OTLPTraceExporter({ + url: getOtelTracesEndpoint() // Using configured endpoint + }); + + const traceProvider = new WebTracerProvider({ + resource: resource, + // Add span processors as array for current OpenTelemetry SDK version + spanProcessors: [new BatchSpanProcessor(traceExporter)] + }); + + traceProvider.register(); + + // Initialize metrics + const metricExporter = new OTLPMetricExporter({ + url: getOtelMetricsEndpoint() + }); + + const metricReader = new PeriodicExportingMetricReader({ + exporter: metricExporter, + exportIntervalMillis: 10000, // 10 seconds + }); + + // Use the MeterProvider constructor with readers array + const meterProvider = new MeterProvider({ + resource: resource, + readers: [metricReader] + }); + + // Register the meter provider globally using proper API + metrics.setGlobalMeterProvider(meterProvider); + + console.log('OpenTelemetry initialized for frontend'); + } catch (error) { + console.error('Failed to initialize OpenTelemetry:', error); + // Continue without OpenTelemetry if initialization fails + } +}; + +// Initialize OpenTelemetry before rendering the app +initOpenTelemetry(); + +// Initialize analytics tracking and store cleanup function +analyticsCleanup = initializeAnalytics(); + +// Cleanup on page unload +window.addEventListener('beforeunload', () => { + if (analyticsCleanup) { + analyticsCleanup(); + } +}); + // PWA/ServiceWorker functionality removed to avoid conflicts in development ReactDOM.createRoot(document.getElementById('root')!).render( diff --git 
a/frontend/src/utils/analytics.ts b/frontend/src/utils/analytics.ts new file mode 100644 index 00000000..888743f0 --- /dev/null +++ b/frontend/src/utils/analytics.ts @@ -0,0 +1,301 @@ +import { trace } from '@opentelemetry/api'; +import { ATTR_HTTP_ROUTE } from '@opentelemetry/semantic-conventions'; + +// Types and Interfaces +interface AnalyticsMetadata { + [key: string]: string | number | boolean | undefined; +} + +// Constants +const ANALYTICS_ENABLED_KEY = 'analyticsEnabled'; +const LOCATION_CONSENT_KEY = 'locationTrackingConsent'; +const SESSION_ID_KEY = 'sessionId'; +const USER_ID_KEY = 'userId'; + +// Generate a unique session ID +const generateSessionId = (): string => { + return Date.now().toString(36) + Math.random().toString(36).substring(2); +}; + +// Get current user ID (implement based on your auth system) +export const getCurrentUserId = (): string | null => { + // This is a placeholder - implement based on your authentication system + // For example, you might get this from localStorage, cookies, or context + return localStorage.getItem(USER_ID_KEY) || sessionStorage.getItem(USER_ID_KEY) || null; +}; + +// Track page view +export const trackPageView = (pathname: string): void => { + // Check if analytics are enabled + if (!isAnalyticsEnabled()) { + return; + } + + try { + const tracer = trace.getTracer('bakery-frontend'); + const user_id = getCurrentUserId(); + + const span = tracer.startSpan('page_view', { + attributes: { + [ATTR_HTTP_ROUTE]: pathname, + 'user.id': user_id || 'anonymous', + 'page.path': pathname, + } + }); + + // End the span immediately for page views + span.end(); + } catch (error) { + console.error('Failed to track page view:', error); + } +}; + +// Check if analytics are enabled +export const isAnalyticsEnabled = (): boolean => { + return localStorage.getItem(ANALYTICS_ENABLED_KEY) !== 'false'; +}; + +// Enable or disable analytics +export const setAnalyticsEnabled = (enabled: boolean): void => { + localStorage.setItem(ANALYTICS_ENABLED_KEY, enabled.toString()); +}; + +// Check if location tracking consent is granted +export const isLocationTrackingConsentGranted = (): boolean => { + return localStorage.getItem(LOCATION_CONSENT_KEY) === 'granted'; +}; + +// Set location tracking consent +export const setLocationTrackingConsent = (granted: boolean): void => { + localStorage.setItem(LOCATION_CONSENT_KEY, granted ? 'granted' : 'denied'); +}; + +// Track user session +export const trackSession = (): (() => void) => { + // Check if analytics are enabled + if (!isAnalyticsEnabled()) { + console.log('Analytics disabled by user preference'); + return () => {}; // Return no-op cleanup function + } + + try { + const tracer = trace.getTracer('bakery-frontend'); + const sessionId = generateSessionId(); + const userId = getCurrentUserId(); + + const span = tracer.startSpan('user_session', { + attributes: { + 'session.id': sessionId, + 'user.id': userId || 'anonymous', + 'browser.user_agent': navigator.userAgent, + 'screen.width': window.screen.width.toString(), + 'screen.height': window.screen.height.toString(), + 'device.type': /mobile|tablet|ipad|iphone|ipod|android|silk/i.test(navigator.userAgent) ? 
'mobile' : 'desktop' + } + }); + + // Store session ID in sessionStorage for later use + sessionStorage.setItem(SESSION_ID_KEY, sessionId); + + // End span when session ends + const handleBeforeUnload = () => { + span.end(); + }; + + window.addEventListener('beforeunload', handleBeforeUnload); + + // Clean up event listener when needed + return () => { + window.removeEventListener('beforeunload', handleBeforeUnload); + }; + } catch (error) { + console.error('Failed to track session:', error); + return () => {}; // Return no-op cleanup function + } +}; + +// Track user action +export const trackUserAction = (action: string, metadata?: AnalyticsMetadata): void => { + // Check if analytics are enabled + if (!isAnalyticsEnabled()) { + return; + } + + try { + const tracer = trace.getTracer('bakery-frontend'); + const userId = getCurrentUserId(); + + const span = tracer.startSpan('user_action', { + attributes: { + 'user.action': action, + 'user.id': userId || 'anonymous', + ...metadata + } + }); + + span.end(); + } catch (error) { + console.error('Failed to track user action:', error); + } +}; + +// Track user location (with consent) +export const trackUserLocation = async (): Promise => { + // Check if analytics are enabled + if (!isAnalyticsEnabled()) { + return; + } + + // Check if location tracking consent is granted + if (!isLocationTrackingConsentGranted()) { + console.log('Location tracking consent not granted'); + return; + } + + try { + const position = await new Promise((resolve, reject) => { + if (!navigator.geolocation) { + reject(new Error('Geolocation not supported')); + return; + } + + navigator.geolocation.getCurrentPosition(resolve, reject, { + enableHighAccuracy: false, + timeout: 10000, + maximumAge: 300000 // 5 minutes + }); + }); + + const tracer = trace.getTracer('bakery-frontend'); + const userId = getCurrentUserId(); + + const span = tracer.startSpan('user_location', { + attributes: { + 'user.id': userId || 'anonymous', + 'location.latitude': position.coords.latitude, + 'location.longitude': position.coords.longitude, + 'location.accuracy': position.coords.accuracy, + 'location.altitude': position.coords.altitude ?? undefined, + 'location.speed': position.coords.speed ?? undefined, + 'location.heading': position.coords.heading ?? 
undefined + } + }); + + span.end(); + } catch (error) { + console.log('Location access denied or unavailable:', error); + } +}; + +// Initialize analytics tracking +export const initializeAnalytics = (): (() => void) => { + // Track initial session + const cleanupSession = trackSession(); + + // Track initial page view + trackPageView(window.location.pathname); + + // Listen for route changes (for SPA navigation) + let previousUrl = window.location.href; + + // For hash-based routing + const handleHashChange = () => { + if (window.location.href !== previousUrl) { + trackPageView(window.location.pathname + window.location.search); + previousUrl = window.location.href; + } + }; + + // For history API-based routing (most common in React apps) + // Use proper typing for history state methods + const originalPushState = history.pushState.bind(history); + const handlePushState = function ( + this: History, + data: unknown, + unused: string, + url?: string | URL | null + ) { + originalPushState(data, unused, url); + setTimeout(() => { + if (window.location.href !== previousUrl) { + trackPageView(window.location.pathname + window.location.search); + previousUrl = window.location.href; + } + }, 0); + }; + + const originalReplaceState = history.replaceState.bind(history); + const handleReplaceState = function ( + this: History, + data: unknown, + unused: string, + url?: string | URL | null + ) { + originalReplaceState(data, unused, url); + setTimeout(() => { + if (window.location.href !== previousUrl) { + trackPageView(window.location.pathname + window.location.search); + previousUrl = window.location.href; + } + }, 0); + }; + + // Override history methods + history.pushState = handlePushState; + history.replaceState = handleReplaceState; + + // Add event listeners + window.addEventListener('hashchange', handleHashChange); + + // Track user consent for location if needed + if (isLocationTrackingConsentGranted()) { + trackUserLocation(); + } + + // Return cleanup function + return () => { + // Restore original history methods + history.pushState = originalPushState; + history.replaceState = originalReplaceState; + + // Remove event listeners + window.removeEventListener('hashchange', handleHashChange); + + // Clean up session tracking + cleanupSession(); + }; +}; + +// Function to track custom metrics using OpenTelemetry spans +export const trackCustomMetric = ( + name: string, + value: number, + attributes?: Record +): void => { + // Check if analytics are enabled + if (!isAnalyticsEnabled()) { + return; + } + + try { + // Record metric as a span with the value as an attribute + // This approach works well for browser-based metrics since + // the OpenTelemetry metrics API in browsers sends to the same collector + const tracer = trace.getTracer('bakery-frontend'); + const userId = getCurrentUserId(); + + const span = tracer.startSpan('custom_metric', { + attributes: { + 'metric.name': name, + 'metric.value': value, + 'user.id': userId || 'anonymous', + ...attributes + } + }); + + span.end(); + } catch (error) { + // Log error but don't fail - metrics are non-critical + console.warn('Failed to track custom metric:', error); + } +}; \ No newline at end of file diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts index 2687a66f..67b0b6b3 100644 --- a/frontend/vite.config.ts +++ b/frontend/vite.config.ts @@ -51,10 +51,11 @@ export default defineConfig(({ mode }) => { build: { outDir: 'dist', // For production builds: ensure assets have correct paths - // Base path should be '/' for root 
deployment + // Base path should match the deployment URL + base: process.env.VITE_BASE_URL || '/', // In development mode: inline source maps for better debugging - // In production mode: external source maps - sourcemap: isDevelopment ? 'inline' : true, + // In production mode: external source maps (can be disabled with VITE_DISABLE_SOURCEMAPS) + sourcemap: process.env.VITE_DISABLE_SOURCEMAPS ? false : (isDevelopment ? 'inline' : true), // In development mode: disable minification for readable errors // In production mode: use esbuild minification minify: isDevelopment ? false : 'esbuild', diff --git a/gateway/app/main.py b/gateway/app/main.py index 6629ff05..55cdad60 100644 --- a/gateway/app/main.py +++ b/gateway/app/main.py @@ -25,7 +25,7 @@ from app.middleware.rate_limiting import APIRateLimitMiddleware from app.middleware.subscription import SubscriptionMiddleware from app.middleware.demo_middleware import DemoMiddleware from app.middleware.read_only_mode import ReadOnlyModeMiddleware -from app.routes import auth, tenant, registration, nominatim, subscription, demo, pos, geocoding, poi_context, webhooks +from app.routes import auth, tenant, registration, nominatim, subscription, demo, pos, geocoding, poi_context, webhooks, telemetry # Initialize logger logger = structlog.get_logger() @@ -169,6 +169,9 @@ app.include_router(demo.router, prefix="/api/v1", tags=["demo"]) # Webhook routes are defined with full /api/v1/webhooks/* paths for consistency app.include_router(webhooks.router, prefix="", tags=["webhooks"]) +# Include telemetry routes for frontend OpenTelemetry data +app.include_router(telemetry.router, prefix="/api/v1", tags=["telemetry"]) + # ================================================================ # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS diff --git a/gateway/app/middleware/auth.py b/gateway/app/middleware/auth.py index 95b68fcb..c71951bf 100644 --- a/gateway/app/middleware/auth.py +++ b/gateway/app/middleware/auth.py @@ -47,7 +47,10 @@ PUBLIC_ROUTES = [ "/api/v1/demo/accounts", "/api/v1/demo/sessions", "/api/v1/webhooks/stripe", # Stripe webhook endpoint - bypasses auth for signature verification - "/api/v1/webhooks/generic" # Generic webhook endpoint + "/api/v1/webhooks/generic", # Generic webhook endpoint + "/api/v1/telemetry/v1/traces", # Frontend telemetry traces - no auth for performance + "/api/v1/telemetry/v1/metrics", # Frontend telemetry metrics - no auth for performance + "/api/v1/telemetry/health" # Telemetry health check ] # Routes accessible with demo session (no JWT required, just demo session header) diff --git a/gateway/app/routes/telemetry.py b/gateway/app/routes/telemetry.py new file mode 100644 index 00000000..6d4c85d4 --- /dev/null +++ b/gateway/app/routes/telemetry.py @@ -0,0 +1,303 @@ +""" +Telemetry routes for API Gateway - Handles frontend telemetry data + +This module provides endpoints for: +- Receiving OpenTelemetry traces from frontend +- Proxying traces to Signoz OTel collector +- Providing a secure, authenticated endpoint for frontend telemetry +""" + +from fastapi import APIRouter, Request, HTTPException, status +from fastapi.responses import JSONResponse, Response +import httpx +import logging +import os +from typing import Optional + +from app.core.config import settings +from app.core.header_manager import header_manager +from shared.monitoring.metrics import MetricsCollector, create_metrics_collector + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/telemetry", tags=["telemetry"]) + +# Get Signoz OTel collector 
endpoint from environment or use default +SIGNOZ_OTEL_COLLECTOR = os.getenv( + "SIGNOZ_OTEL_COLLECTOR_URL", + "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" +) + +@router.post("/v1/traces") +async def receive_frontend_traces(request: Request): + """ + Receive OpenTelemetry traces from frontend and proxy to Signoz + + This endpoint: + - Accepts OTLP trace data from frontend + - Validates the request + - Proxies to Signoz OTel collector + - Handles errors gracefully + """ + + # Handle OPTIONS requests for CORS + if request.method == "OPTIONS": + return Response( + status_code=200, + headers={ + "Access-Control-Allow-Origin": settings.CORS_ORIGINS_LIST, + "Access-Control-Allow-Methods": "POST, OPTIONS", + "Access-Control-Allow-Headers": "Content-Type, Authorization, X-Tenant-ID", + "Access-Control-Allow-Credentials": "true", + "Access-Control-Max-Age": "86400" + } + ) + + try: + # Get the trace data from the request + body = await request.body() + + if not body: + logger.warning("Received empty trace data from frontend") + return JSONResponse( + status_code=400, + content={"error": "Empty trace data"} + ) + + # Log the trace reception (without sensitive data) + logger.info( + "Received frontend traces, content_length=%s, content_type=%s, user_agent=%s", + len(body), + request.headers.get("content-type"), + request.headers.get("user-agent") + ) + + # Forward to Signoz OTel collector + target_url = f"{SIGNOZ_OTEL_COLLECTOR}/v1/traces" + + # Set up headers for the Signoz collector + forward_headers = { + "Content-Type": request.headers.get("content-type", "application/json"), + "User-Agent": "bakery-gateway/1.0", + "X-Forwarded-For": request.headers.get("x-forwarded-for", "frontend"), + "X-Tenant-ID": request.headers.get("x-tenant-id", "unknown") + } + + # Add authentication if configured + signoz_auth_token = os.getenv("SIGNOZ_AUTH_TOKEN") + if signoz_auth_token: + forward_headers["Authorization"] = f"Bearer {signoz_auth_token}" + + # Send to Signoz collector + timeout_config = httpx.Timeout( + connect=5.0, + read=10.0, + write=5.0, + pool=5.0 + ) + + async with httpx.AsyncClient(timeout=timeout_config) as client: + response = await client.post( + url=target_url, + content=body, + headers=forward_headers + ) + + # Log the response from Signoz + logger.info( + "Forwarded traces to Signoz, signoz_status=%s, signoz_response_time=%s", + response.status_code, + response.elapsed.total_seconds() + ) + + # Return success response to frontend + return JSONResponse( + status_code=200, + content={ + "message": "Traces received and forwarded to Signoz", + "signoz_status": response.status_code, + "trace_count": 1 # We don't know exact count without parsing + } + ) + + except httpx.HTTPStatusError as e: + logger.error( + "Signoz collector returned error, status_code=%s, error_message=%s", + e.response.status_code, + str(e) + ) + return JSONResponse( + status_code=502, + content={ + "error": "Signoz collector error", + "details": str(e), + "signoz_status": e.response.status_code + } + ) + + except httpx.RequestError as e: + logger.error( + "Failed to connect to Signoz collector, error=%s, collector_url=%s", + str(e), + SIGNOZ_OTEL_COLLECTOR + ) + return JSONResponse( + status_code=503, + content={ + "error": "Signoz collector unavailable", + "details": str(e) + } + ) + + except Exception as e: + logger.error( + "Unexpected error processing traces, error=%s, error_type=%s", + str(e), + type(e).__name__ + ) + return JSONResponse( + status_code=500, + content={ + "error": "Internal server 
error", + "details": str(e) + } + ) + +@router.post("/v1/metrics") +async def receive_frontend_metrics(request: Request): + """ + Receive OpenTelemetry metrics from frontend and proxy to Signoz + """ + + # Handle OPTIONS requests for CORS + if request.method == "OPTIONS": + return Response( + status_code=200, + headers={ + "Access-Control-Allow-Origin": settings.CORS_ORIGINS_LIST, + "Access-Control-Allow-Methods": "POST, OPTIONS", + "Access-Control-Allow-Headers": "Content-Type, Authorization, X-Tenant-ID", + "Access-Control-Allow-Credentials": "true", + "Access-Control-Max-Age": "86400" + } + ) + + try: + body = await request.body() + + if not body: + return JSONResponse( + status_code=400, + content={"error": "Empty metrics data"} + ) + + logger.info( + "Received frontend metrics, content_length=%s, content_type=%s", + len(body), + request.headers.get("content-type") + ) + + # Forward to Signoz OTel collector + target_url = f"{SIGNOZ_OTEL_COLLECTOR}/v1/metrics" + + forward_headers = { + "Content-Type": request.headers.get("content-type", "application/json"), + "User-Agent": "bakery-gateway/1.0", + "X-Forwarded-For": request.headers.get("x-forwarded-for", "frontend"), + "X-Tenant-ID": request.headers.get("x-tenant-id", "unknown") + } + + # Add authentication if configured + signoz_auth_token = os.getenv("SIGNOZ_AUTH_TOKEN") + if signoz_auth_token: + forward_headers["Authorization"] = f"Bearer {signoz_auth_token}" + + timeout_config = httpx.Timeout( + connect=5.0, + read=10.0, + write=5.0, + pool=5.0 + ) + + async with httpx.AsyncClient(timeout=timeout_config) as client: + response = await client.post( + url=target_url, + content=body, + headers=forward_headers + ) + + logger.info( + "Forwarded metrics to Signoz, signoz_status=%s", + response.status_code + ) + + return JSONResponse( + status_code=200, + content={ + "message": "Metrics received and forwarded to Signoz", + "signoz_status": response.status_code + } + ) + + except Exception as e: + logger.error( + "Error processing metrics, error=%s", + str(e) + ) + return JSONResponse( + status_code=500, + content={ + "error": "Internal server error", + "details": str(e) + } + ) + +@router.get("/health") +async def telemetry_health(): + """ + Health check endpoint for telemetry service + """ + return JSONResponse( + status_code=200, + content={ + "status": "healthy", + "service": "telemetry-gateway", + "signoz_collector": SIGNOZ_OTEL_COLLECTOR + } + ) + +# Initialize metrics for this module +try: + metrics_collector = create_metrics_collector("gateway-telemetry") +except Exception as e: + logger.error("Failed to create metrics collector, error=%s", str(e)) + metrics_collector = None + +@router.on_event("startup") +async def startup_event(): + """Initialize telemetry metrics on startup""" + try: + if metrics_collector: + # Register telemetry-specific metrics + metrics_collector.register_counter( + "gateway_telemetry_traces_received", + "Number of trace batches received from frontend" + ) + metrics_collector.register_counter( + "gateway_telemetry_metrics_received", + "Number of metric batches received from frontend" + ) + metrics_collector.register_counter( + "gateway_telemetry_errors", + "Number of telemetry processing errors" + ) + + logger.info( + "Telemetry gateway initialized, signoz_collector=%s", + SIGNOZ_OTEL_COLLECTOR + ) + + except Exception as e: + logger.error( + "Failed to initialize telemetry metrics, error=%s", + str(e) + ) \ No newline at end of file diff --git a/infrastructure/helm/signoz-values-prod.yaml 
b/infrastructure/helm/signoz-values-prod.yaml index 5cbee072..b5b95afc 100644 --- a/infrastructure/helm/signoz-values-prod.yaml +++ b/infrastructure/helm/signoz-values-prod.yaml @@ -6,7 +6,7 @@ # Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml global: - storageClass: "standard" # For MicroK8s, use "microk8s-hostpath" or custom storage class + storageClass: "microk8s-hostpath" # For MicroK8s, use "microk8s-hostpath" or custom storage class clusterName: "bakery-ia-prod" domain: "monitoring.bakewise.ai" # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc) diff --git a/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml b/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml index 7dc3702d..0425929c 100644 --- a/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml +++ b/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml @@ -140,10 +140,9 @@ spec: name: pos-integration-secrets - secretRef: name: whatsapp-secrets - volumeMounts: - - name: model-storage - mountPath: /app/models - readOnly: true # Forecasting only reads models + - secretRef: + name: minio-secrets + # Model storage now uses MinIO - no local volumeMounts needed resources: requests: memory: "512Mi" @@ -172,10 +171,7 @@ spec: secret: secretName: redis-tls-secret defaultMode: 0400 - - name: model-storage - persistentVolumeClaim: - claimName: model-storage - readOnly: true # Forecasting only reads models + # Model storage migrated to MinIO - PVC no longer needed --- apiVersion: v1 diff --git a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml index e4079599..9f7f2e94 100644 --- a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml +++ b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml @@ -56,6 +56,11 @@ spec: configMapKeyRef: name: bakery-config key: OTEL_EXPORTER_OTLP_ENDPOINT + - name: SIGNOZ_OTEL_COLLECTOR_URL + valueFrom: + configMapKeyRef: + name: bakery-config + key: SIGNOZ_OTEL_COLLECTOR_URL resources: requests: memory: "256Mi" diff --git a/infrastructure/kubernetes/base/components/minio/minio-deployment.yaml b/infrastructure/kubernetes/base/components/minio/minio-deployment.yaml new file mode 100644 index 00000000..d19f3baa --- /dev/null +++ b/infrastructure/kubernetes/base/components/minio/minio-deployment.yaml @@ -0,0 +1,154 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio + namespace: bakery-ia + labels: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage + app.kubernetes.io/part-of: bakery-ia +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage + template: + metadata: + labels: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage + spec: + # Init container to set up TLS certificates with correct permissions + initContainers: + - name: init-certs + image: busybox:1.36 + command: + - sh + - -c + - | + mkdir -p /certs/CAs + cp /certs-secret/minio-cert.pem /certs/public.crt + cp /certs-secret/minio-key.pem /certs/private.key + cp /certs-secret/ca-cert.pem /certs/CAs/ca.crt + chmod 600 /certs/private.key + chmod 644 /certs/public.crt /certs/CAs/ca.crt + volumeMounts: + - name: certs-secret + mountPath: /certs-secret + readOnly: true + - name: certs + mountPath: 
/certs + containers: + - name: minio + image: minio/minio:RELEASE.2024-11-07T00-52-20Z + args: + - server + - /data + - --console-address + - :9001 + - --address + - :9000 + - --certs-dir + - /certs + env: + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: minio-secrets + key: MINIO_ROOT_USER + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: minio-secrets + key: MINIO_ROOT_PASSWORD + # Enable TLS for MinIO + - name: MINIO_SERVER_URL + value: "https://minio.bakery-ia.svc.cluster.local:9000" + - name: MINIO_BROWSER_REDIRECT_URL + value: "https://minio-console.bakery-ia.svc.cluster.local:9001" + ports: + - containerPort: 9000 + name: api + - containerPort: 9001 + name: console + volumeMounts: + - name: minio-data + mountPath: /data + - name: certs + mountPath: /certs + readOnly: true + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "2Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: /minio/health/live + port: 9000 + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /minio/health/ready + port: 9000 + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 15 + volumes: + - name: minio-data + persistentVolumeClaim: + claimName: minio-data + - name: certs-secret + secret: + secretName: minio-tls + - name: certs + emptyDir: {} + +--- +apiVersion: v1 +kind: Service +metadata: + name: minio + namespace: bakery-ia + labels: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage +spec: + type: ClusterIP + ports: + - port: 9000 + targetPort: 9000 + protocol: TCP + name: api + - port: 9001 + targetPort: 9001 + protocol: TCP + name: console + selector: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage + +--- +apiVersion: v1 +kind: Service +metadata: + name: minio-console + namespace: bakery-ia + labels: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage +spec: + type: ClusterIP + ports: + - port: 9001 + targetPort: 9001 + protocol: TCP + name: console + selector: + app.kubernetes.io/name: minio + app.kubernetes.io/component: storage \ No newline at end of file diff --git a/infrastructure/kubernetes/base/components/minio/minio-pvc.yaml b/infrastructure/kubernetes/base/components/minio/minio-pvc.yaml new file mode 100644 index 00000000..4db1f2a4 --- /dev/null +++ b/infrastructure/kubernetes/base/components/minio/minio-pvc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: minio-data + namespace: bakery-ia + labels: + app.kubernetes.io/name: minio-data + app.kubernetes.io/component: storage + app.kubernetes.io/part-of: bakery-ia +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + storageClassName: standard \ No newline at end of file diff --git a/infrastructure/kubernetes/base/components/minio/minio-secrets.yaml b/infrastructure/kubernetes/base/components/minio/minio-secrets.yaml new file mode 100644 index 00000000..0bab607c --- /dev/null +++ b/infrastructure/kubernetes/base/components/minio/minio-secrets.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Secret +metadata: + name: minio-secrets + namespace: bakery-ia + labels: + app.kubernetes.io/name: minio-secrets + app.kubernetes.io/component: storage + app.kubernetes.io/part-of: bakery-ia +type: Opaque +data: + # MinIO Root Credentials (base64 encoded) + MINIO_ROOT_USER: YWRtaW4= # admin + MINIO_ROOT_PASSWORD: c2VjdXJlLXBhc3N3b3Jk # secure-password + + # Service Account Credentials for applications + MINIO_ACCESS_KEY: 
dHJhaW5pbmctc2VydmljZQ== # training-service + MINIO_SECRET_KEY: dHJhaW5pbmctc2VjcmV0LWtleQ== # training-secret-key + + # Forecasting Service Credentials + FORECASTING_MINIO_ACCESS_KEY: Zm9yZWNhc3Rpbmctc2VydmljZQ== # forecasting-service + FORECASTING_MINIO_SECRET_KEY: Zm9yZWNhc3Rpbmctc2VjcmV0LWtleQ== # forecasting-secret-key \ No newline at end of file diff --git a/infrastructure/kubernetes/base/components/training/training-service.yaml b/infrastructure/kubernetes/base/components/training/training-service.yaml index 7a13ec47..becfa200 100644 --- a/infrastructure/kubernetes/base/components/training/training-service.yaml +++ b/infrastructure/kubernetes/base/components/training/training-service.yaml @@ -140,11 +140,11 @@ spec: name: pos-integration-secrets - secretRef: name: whatsapp-secrets + - secretRef: + name: minio-secrets volumeMounts: - name: tmp-storage mountPath: /tmp - - name: model-storage - mountPath: /app/models resources: requests: memory: "512Mi" @@ -176,9 +176,6 @@ spec: - name: tmp-storage emptyDir: sizeLimit: 4Gi # Increased from 2Gi to handle cmdstan temp files during optimization - - name: model-storage - persistentVolumeClaim: - claimName: model-storage --- apiVersion: v1 diff --git a/infrastructure/kubernetes/base/components/volumes/model-storage-pvc.yaml b/infrastructure/kubernetes/base/components/volumes/model-storage-pvc.yaml deleted file mode 100644 index de66c613..00000000 --- a/infrastructure/kubernetes/base/components/volumes/model-storage-pvc.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: model-storage - namespace: bakery-ia - labels: - app.kubernetes.io/name: model-storage - app.kubernetes.io/component: storage - app.kubernetes.io/part-of: bakery-ia -spec: - accessModes: - - ReadWriteOnce # Single node access (works with local Kubernetes) - resources: - requests: - storage: 10Gi # Adjust based on your needs - storageClassName: standard # Use default local-path provisioner diff --git a/infrastructure/kubernetes/base/configmap.yaml b/infrastructure/kubernetes/base/configmap.yaml index 3b73f913..43ea8100 100644 --- a/infrastructure/kubernetes/base/configmap.yaml +++ b/infrastructure/kubernetes/base/configmap.yaml @@ -66,6 +66,17 @@ data: ALERT_PROCESSOR_DB_HOST: "alert-processor-db-service" AI_INSIGHTS_DB_HOST: "ai-insights-db-service" DISTRIBUTION_DB_HOST: "distribution-db-service" + DEMO_SESSION_DB_HOST: "demo-session-db-service" + + # MinIO Configuration + MINIO_ENDPOINT: "minio.bakery-ia.svc.cluster.local:9000" + MINIO_USE_SSL: "true" + MINIO_MODEL_BUCKET: "training-models" + MINIO_CONSOLE_PORT: "9001" + MINIO_API_PORT: "9000" + MINIO_REGION: "us-east-1" + MINIO_MODEL_LIFECYCLE_DAYS: "90" + MINIO_CACHE_TTL_SECONDS: "3600" # Database Configuration DB_PORT: "5432" @@ -238,7 +249,8 @@ data: # ================================================================ # MODEL STORAGE & TRAINING # ================================================================ - MODEL_STORAGE_PATH: "/app/models" + # Model storage is handled by MinIO (see MinIO Configuration section) + MODEL_STORAGE_BACKEND: "minio" MODEL_BACKUP_ENABLED: "true" MODEL_VERSIONING_ENABLED: "true" MAX_TRAINING_TIME_MINUTES: "30" @@ -416,6 +428,9 @@ data: # OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317" # OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" + # Gateway telemetry proxy configuration + SIGNOZ_OTEL_COLLECTOR_URL: 
"http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" + # Optional: Protocol overrides per signal # OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: "grpc" # OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: "grpc" diff --git a/infrastructure/kubernetes/base/jobs/minio-bucket-init-job.yaml b/infrastructure/kubernetes/base/jobs/minio-bucket-init-job.yaml new file mode 100644 index 00000000..1e96e033 --- /dev/null +++ b/infrastructure/kubernetes/base/jobs/minio-bucket-init-job.yaml @@ -0,0 +1,193 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: minio-bucket-init + namespace: bakery-ia + labels: + app.kubernetes.io/name: minio-bucket-init + app.kubernetes.io/component: storage-init + app.kubernetes.io/part-of: bakery-ia +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 3 + template: + metadata: + labels: + app.kubernetes.io/name: minio-bucket-init + app.kubernetes.io/component: storage-init + spec: + restartPolicy: OnFailure + initContainers: + # Wait for MinIO to be ready + - name: wait-for-minio + image: busybox:1.36 + command: + - sh + - -c + - | + echo "Waiting for MinIO to be ready..." + until nc -z minio.bakery-ia.svc.cluster.local 9000; do + echo "MinIO not ready, waiting..." + sleep 5 + done + echo "MinIO is ready!" + containers: + - name: bucket-init + image: minio/mc:RELEASE.2024-11-17T19-35-25Z + command: + - /bin/sh + - -c + - | + set -e + + echo "Configuring MinIO client..." + + # Configure mc alias with TLS (skip cert verification for self-signed) + mc alias set myminio https://minio.bakery-ia.svc.cluster.local:9000 \ + ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD} --insecure + + echo "Creating buckets..." + + # Create training-models bucket if not exists + if ! mc ls myminio/training-models --insecure 2>/dev/null; then + mc mb myminio/training-models --insecure + echo "Created bucket: training-models" + else + echo "Bucket already exists: training-models" + fi + + # Set bucket policy (private by default) + mc anonymous set none myminio/training-models --insecure + + # Enable versioning for model backups + mc version enable myminio/training-models --insecure + echo "Enabled versioning on training-models bucket" + + # Set lifecycle policy to expire old versions after 90 days + cat > /tmp/lifecycle.json << 'EOF' + { + "Rules": [ + { + "ID": "expire-old-versions", + "Status": "Enabled", + "Filter": { + "Prefix": "models/" + }, + "NoncurrentVersionExpiration": { + "NoncurrentDays": 90 + } + }, + { + "ID": "expire-old-metadata", + "Status": "Enabled", + "Filter": { + "Prefix": "models/" + }, + "Expiration": { + "ExpiredObjectDeleteMarker": true + } + } + ] + } + EOF + mc ilm import myminio/training-models < /tmp/lifecycle.json --insecure || true + echo "Lifecycle policy configured" + + # Create service accounts with limited permissions + echo "Creating service accounts..." 
+ + # Training service policy (read/write models) + cat > /tmp/training-policy.json << 'EOF' + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:ListBucketMultipartUploads" + ], + "Resource": [ + "arn:aws:s3:::training-models", + "arn:aws:s3:::training-models/*" + ] + } + ] + } + EOF + + # Forecasting service policy (read-only models) + cat > /tmp/forecasting-policy.json << 'EOF' + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::training-models", + "arn:aws:s3:::training-models/*" + ] + } + ] + } + EOF + + # Create service accounts using credentials from secrets + echo "Creating service accounts..." + mc admin user add myminio ${TRAINING_MINIO_USER} ${TRAINING_MINIO_PASSWORD} --insecure 2>/dev/null || true + mc admin user add myminio ${FORECASTING_MINIO_USER} ${FORECASTING_MINIO_PASSWORD} --insecure 2>/dev/null || true + + # Apply policies (ignore errors if already exists) + mc admin policy create myminio training-policy /tmp/training-policy.json --insecure 2>/dev/null || true + mc admin policy attach myminio training-policy --user=${TRAINING_MINIO_USER} --insecure 2>/dev/null || true + + mc admin policy create myminio forecasting-policy /tmp/forecasting-policy.json --insecure 2>/dev/null || true + mc admin policy attach myminio forecasting-policy --user=${FORECASTING_MINIO_USER} --insecure 2>/dev/null || true + + echo "MinIO bucket initialization complete!" + + # List buckets for verification + echo "Current buckets:" + mc ls myminio --insecure + + env: + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: minio-secrets + key: MINIO_ROOT_USER + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: minio-secrets + key: MINIO_ROOT_PASSWORD + # Training service MinIO credentials + - name: TRAINING_MINIO_USER + valueFrom: + secretKeyRef: + name: minio-secrets + key: MINIO_ACCESS_KEY + - name: TRAINING_MINIO_PASSWORD + valueFrom: + secretKeyRef: + name: minio-secrets + key: MINIO_SECRET_KEY + # Forecasting service MinIO credentials + - name: FORECASTING_MINIO_USER + valueFrom: + secretKeyRef: + name: minio-secrets + key: FORECASTING_MINIO_ACCESS_KEY + - name: FORECASTING_MINIO_PASSWORD + valueFrom: + secretKeyRef: + name: minio-secrets + key: FORECASTING_MINIO_SECRET_KEY diff --git a/infrastructure/kubernetes/base/kustomization.yaml b/infrastructure/kubernetes/base/kustomization.yaml index 2ca17e9d..3afb6c87 100644 --- a/infrastructure/kubernetes/base/kustomization.yaml +++ b/infrastructure/kubernetes/base/kustomization.yaml @@ -18,6 +18,13 @@ resources: # Additional configs - configs/postgres-init-config.yaml + + # MinIO Storage (with TLS) + - components/minio/minio-secrets.yaml + - secrets/minio-tls-secret.yaml + - components/minio/minio-pvc.yaml + - components/minio/minio-deployment.yaml + - jobs/minio-bucket-init-job.yaml # Migration jobs - migrations/auth-migration-job.yaml @@ -63,9 +70,6 @@ resources: - components/nominatim/nominatim.yaml - jobs/nominatim-init-job.yaml - # Persistent storage - - components/volumes/model-storage-pvc.yaml - # Cert manager cluster issuers - components/cert-manager/cluster-issuer-staging.yaml - components/cert-manager/local-ca-issuer.yaml diff --git a/infrastructure/kubernetes/base/secrets/minio-tls-secret.yaml b/infrastructure/kubernetes/base/secrets/minio-tls-secret.yaml new file mode 
100644 index 00000000..d1075a5b --- /dev/null +++ b/infrastructure/kubernetes/base/secrets/minio-tls-secret.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: Secret +metadata: + name: minio-tls + namespace: bakery-ia + labels: + app.kubernetes.io/name: bakery-ia + app.kubernetes.io/component: minio-tls + app.kubernetes.io/part-of: bakery-ia +type: Opaque +data: + # MinIO TLS certificates (base64 encoded) + # Generated using infrastructure/tls/generate-minio-certificates.sh + # Valid for 3 years from generation date + # + # Certificate details: + # Subject: CN=minio.bakery-ia.svc.cluster.local, O=BakeryIA, OU=Storage + # Issuer: CN=BakeryIA-CA, O=BakeryIA, OU=Security + # + # To regenerate: + # 1. Run: infrastructure/tls/generate-minio-certificates.sh + # 2. Run: scripts/create-tls-secrets.sh + + ca-cert.pem: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZ5ekNDQTdPZ0F3SUJBZ0lVUGdPcU5ZK1pvS0J5UTFNZk84bGtpR2hPbXhJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd2RURUxNQWtHQTFVRUJoTUNWVk14RXpBUkJnTlZCQWdNQ2tOaGJHbG1iM0p1YVdFeEZUQVRCZ05WQkFjTQpERk5oYmtaeVlXNWphWE5qYnpFUk1BOEdBMVVFQ2d3SVFtRnJaWEo1U1VFeEVUQVBCZ05WQkFzTUNGTmxZM1Z5CmFYUjVNUlF3RWdZRFZRUUREQXRDWVd0bGNubEpRUzFEUVRBZUZ3MHlOVEV3TVRneE5ESXlNVFJhRncwek5URXcKTVRZeE5ESXlNVFJhTUhVeEN6QUpCZ05WQkFZVEFsVlRNUk13RVFZRFZRUUlEQXBEWVd4cFptOXlibWxoTVJVdwpFd1lEVlFRSERBeFRZVzVHY21GdVkybHpZMjh4RVRBUEJnTlZCQW9NQ0VKaGEyVnllVWxCTVJFd0R3WURWUVFMCkRBaFRaV04xY21sMGVURVVNQklHQTFVRUF3d0xRbUZyWlhKNVNVRXRRMEV3Z2dJaU1BMEdDU3FHU0liM0RRRUIKQVFVQUE0SUNEd0F3Z2dJS0FvSUNBUURSRDVPMmVna1lnOUhOUlI1U1UwYkxuR0hqcHYvUmFnck03ZGh1c2FXbgpyZkRGNVZwVFo0czkvOXNPRUowTnlqdW9LWGFtb3VUd1IxbncxOUZkSDhmMWVvbWNRNGVLdzJIa3hveHFSMzR0ClJEYUFHejNiV08rcmFUUTRTeU1LN1hGTW92VVVpTGwrR08yM2wxQk5QZmh6a2NEa1o5N200MzRmMVFWbzk5dGIKaFY0YklMYW9GSXFmMDlNMEUxL2ZhQitKQ1I4WWtsN0xvWGd1ejNWUi9CVW5kMHZNc1RNV3VlRC8yblZ1VVpPMAowcFVtVFVCUTJRZDc2NTdrL0hXZC8xd2NFQUw5ZFhOUmJ4aEROZkdnYzNXdFFoZ2djcFlMUWFmTGE4MXRseHljCndEZ042UGRFbFVseGdYL091b1oxeWxNWkU3eHBzTXRwbjFBd2VvZFZibTNRcDVBMXlkeWJFNjF1MXVyWXoxTHQKV05aOWVPZkFxZXdpWVFIVlpXTUM0YTRTYSsyeU02cTVQWC80ZytUYklUaDhoWkp3WFBLNUVEaWc3dkYxNEpQbApsRVJOcHdpYTNuNmEwUDcwM0hQTjZya1FPNWtWVGRpVXNmaWJNdGNVSkhMeVdXUUFSQm15ZVZma0lDYWFlWUVsCkVMa3N3YTlOVkVTS3ZRYUhLU2lIWkZoRUkwYUF2Y3BBam0xRU9oRWEraFNSaE9vRnlVT3ZHK2NNT2ZjQlNtTDAKVW1sRC9sZmFuVFQwems1YXFzcEVrWEdlQnczMXJtWi8wQVpPalYycHBSeFdXZWt6bzlCZjdnNmVMVFk0VUNDNQpNeVB0em14OVRiWHJOQW5YaGlGNkxnNWgyOFI0MkdUZTVBZDZUSGtGOVMvS2hxOHUwZFk1U0EyR1VGMUViUU84Ckt3SURBUUFCbzFNd1VUQWRCZ05WSFE0RUZnUVVBKzZxL2tjOGZUUVUxRURxekdSZktRcHE2bTB3SHdZRFZSMGoKQkJnd0ZvQVVBKzZxL2tjOGZUUVUxRURxekdSZktRcHE2bTB3RHdZRFZSMFRBUUgvQkFVd0F3RUIvekFOQmdrcQpoa2lHOXcwQkFRc0ZBQU9DQWdFQVF1dkZoMitIUUZ5OFZUY1VnYWxFVmlheXQxelFHdjRySVNtaXEzRzZJZVhQClhTNGd3cUhrRnpUd1p2bW9oVHdtT0N3Vy94RjRLZ3htRmJ5V05yRUpKRXFjYmVkcVVXVi8wQkNhRm1KdlVkZEkKK2V4L2lEM0ZlYnU4QUZJK0o4bEJIL0NlbkRpU0xIaGd5c2VZOHV3Um5Yc3NoWDVSbkRpckYxdUtyMUo2MzVhbgpHbHlGSU5Vcm5RbGd1RXZ0cjBlbkdVbHpUNXJXajR5MEFXVWRiWGk4dlJzaldvUThKYTBCeFRyWVloL2tPL0ZJClB0cVg3d3N4b0pNREVRNzF6aHdhN1dMUWMyZGZiMnJBcjF1QmgzcU53aVZCSU5CK3QzSkZ2NzJ4cXNXZ3VySUIKSWYyc29SVEkybk1lNWdURzFEZmQrVjI0amZhL3lJZ0FzTWpDem1HUUsyMHZvYlg0c0FWbm1QVmJaZzlTTEZaaQpNaWRrbjlPOVU2OE1FT2UzSWFzY2xkN2ZwNUprK0hyYkpVNi9zMTZFRVIvQWdEM09vajN3UmdqVENTK0FERCtqCnhvMk84Vlgya1BvMDNBTitpWWEzbkptbE1GekNyelQrOFp4U25QNUZxR2cyRUNFYnFxQTBCLzVuYVZwbWRZYVYKNDFvRkxzd2NGbTJpcUdhd2JzTE45eDN0dklDdUU5M0hZazFqNzJQelhhaVNMdHB2YW1IMWRSWUMrSFVNMUwwTwo0OUNOTVlKZUwvTmx5UXVaSm0yWDBxRE5TWG1STUw4SFU5c093V1g2cFBQSk96dXF0Z2R4Lytsa0dBZDJ3WkpVCklWYm1MNlF2emRidGEvY1NWd3NMdEJ6RzQ4YTFiNEtCYzdXTEhUd2JyZEJSVGcwVGtMWTRrdkNaZTVuTmw0RT0KLS0tLS1FTkQgQ0VS
VElGSUNBVEUtLS0tLQo= + + minio-cert.pem: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUdyVENDQkpXZ0F3SUJBZ0lVRytCME0ycnhucWpHZHRmbzBCaGV2S0N4MGdBd0RRWUpLb1pJaHZjTkFRRUwKQlFBd2RURUxNQWtHQTFVRUJoTUNWVk14RXpBUkJnTlZCQWdNQ2tOaGJHbG1iM0p1YVdFeEZUQVRCZ05WQkFjTQpERk5oYmtaeVlXNWphWE5qYnpFUk1BOEdBMVVFQ2d3SVFtRnJaWEo1U1VFeEVUQVBCZ05WQkFzTUNGTmxZM1Z5CmFYUjVNUlF3RWdZRFZRUUREQXRDWVd0bGNubEpRUzFEUVRBZUZ3MHlOakF4TVRjeE5EVTBORGhhRncweU9UQXgKTVRZeE5EVTBORGhhTUlHS01Rc3dDUVlEVlFRR0V3SlZVekVUTUJFR0ExVUVDQXdLUTJGc2FXWnZjbTVwWVRFVgpNQk1HQTFVRUJ3d01VMkZ1Um5KaGJtTnBjMk52TVJFd0R3WURWUVFLREFoQ1lXdGxjbmxKUVRFUU1BNEdBMVVFCkN3d0hVM1J2Y21GblpURXFNQ2dHQTFVRUF3d2hiV2x1YVc4dVltRnJaWEo1TFdsaExuTjJZeTVqYkhWemRHVnkKTG14dlkyRnNNSUlDSWpBTkJna3Foa2lHOXcwQkFRRUZBQU9DQWc4QU1JSUNDZ0tDQWdFQW5qdTd0cFF3dkYvVgprL011UmhySllWME1KcXRyRkovTlgrMU9MSmFNaEZYL0tZMTBMUCtCNjV3L3BsWkd5SnRidFVkV2o1d1pMclpRCm1KYjNwNFR0dUs0QlQxZ3UzYlNaS0lIUU5lQWc4MUtzTUdxKzV1WE9vUFdOckFoaDRoWU9KNDVtSXNZYmEwRGQKTzJNRnY5V3VXVm4zVDZGenpNN3FMZENKelpOamVhQjdtVEpqZEhHcjg0aVQ4NkFFQStIeXd2c3FPb2paZStVagpLdThYcmp4VUdSL2VQRnZRQ3lNZFdnRmJqd2lqSi9CbjhSQ0FSSXVpRXNzalNMUVdPZ1FncklBVHZFRi9jeVVkClpLR2hhYzMvNEk3MXhEV2hYNzFYV1l3T05FbXJRNmNHelhtdmNVTVY4SHZFV016YjA1UnBPWXp5bUtyYnhOTDQKZVdOYUt2cnZjWnpjTXpwSU00UmVHS3cyTjlzQUdzM1lCVFI3V1hMS1dnbkxZYnNvSHgzZGRadXlRK0hKd0RUWApxcFh1dFloYW9DZmZIMjNuTU1GaUFLMWltZWJCSTFoVWNBaVB2cFN4N2RJM21nTlA0YWZOL29xaE1PUGc4VHhtCndNZWt2cHovN2NXYkNPTmprZDlkcTBWTExTVyt0cUlmZlZRajBMT1VQdlhyTE9tUG1jTDZsU2xSTzg4NVRWdngKSkRidDJYVVJtaHFKenBhcklmTmhGOUVscEhtYnNkc2xtWVBvLzlKV1VtcmtiSjZBYWZkbEpuckNUR3hKcGl3TAowbEpveEl3dnFZdDhEQnVjMWNORktKSVNMWkl5bzZ1WFJ1TlZvTnByeGdmVXZsOENscDNnUyttSVNGZzMzdTJrCkpjYnF6bnZ2YzN0YmxIZTB4ZzJNSE1JVlRkWmlSamNDQXdFQUFhT0NBUjB3Z2dFWk1Bc0dBMVVkRHdRRUF3SUUKTURBZEJnTlZIU1VFRmpBVUJnZ3JCZ0VGQlFjREFRWUlLd1lCQlFVSEF3SXdnYW9HQTFVZEVRU0JvakNCbjRJaApiV2x1YVc4dVltRnJaWEo1TFdsaExuTjJZeTVqYkhWemRHVnlMbXh2WTJGc2dnOXRhVzVwYnk1aVlXdGxjbmt0CmFXR0NLVzFwYm1sdkxXTnZibk52YkdVdVltRnJaWEo1TFdsaExuTjJZeTVqYkhWemRHVnlMbXh2WTJGc2doZHQKYVc1cGJ5MWpiMjV6YjJ4bExtSmhhMlZ5ZVMxcFlZSUZiV2x1YVcrQ0RXMXBibWx2TFdOdmJuTnZiR1dDQ1d4dgpZMkZzYUc5emRJY0Vmd0FBQVRBZEJnTlZIUTRFRmdRVXJXMzNxOWkreE5MdVZjcGUrKzlxUE56dVF4VXdId1lEClZSMGpCQmd3Rm9BVUErNnEva2M4ZlRRVTFFRHF6R1JmS1FwcTZtMHdEUVlKS29aSWh2Y05BUUVMQlFBRGdnSUIKQUlTT0NieFJWd2xtaWdjNldLM3hUaUJxNlJGMGNzdnV5NjJNYnI3N0h0Q3VPNHgxOTI5QjAxMXd1djdnWEhmawpPQm9qa3ZwZnFQUXlRZTk2dGFwRGJqYWZpeStlSHBPSm1lQjFNN2lQKzEzTGJJRjN3alE5SXZ1TWtnN3FQczZXCk15cnBvd1ZwK1BPeDU2SlJRK3lPcm5nakgxRG9FMW45NDBJR0lTZkRmb2g3cTljMkNvSlA2cWo3YWxid1U4RU0KYlB5d3B4WkFTNjYydUtBR0VNcFNLK2NuMXdUU3ZWSDN6NDVrMk9yUmwvQ05PZ0Fad1dyNzdQK1A3bW9FSHlmUQplR0dpclJTWWswUkJtYzdOTGd0Ry9iV0JQTEt4dHIyQmZidDFwZFZXakd4TmlwaDR4c1Z0YldpNnVOeUxYNE1qCllyK0FVUjd1MHlCVWxSc1VUL1dDbkFYdnRmNzRwcWJaNDZ3YjFnajEreU1GWHRNUldVV2NFcU1GVXRJdEsrUngKSlA4bUErbW9qdEdOcGdJZG53b1pPMTBsQkZ2U0ZKL1hGUFlsbHFKOGJpWmJ3RDZtWElzei9WQmdDRHlyQ3kybwpQeVhzR29HNDdTZkovQldvdHUwRkNaZERreCtQU0k2bkdKdyt2empSVzJ3TU9tdzJiZ0xkK3dsVDNpTXp4V3VOCkNidk0wSmpTQ2J3YVMvdE84emtrNGROeVhkWWNQbkJPNVJlM1IrQUV3T0RxV2F4T0ZXYmVUWW10bHlOTXdNT04Kd2lpR3pLWjkwaHM5QSt6M2x0QldNNmxNOFBJaFplcHB1TEZNTDRMSjZ0Ti93anJrOEVVMFBNT2ZlUTVjWXprZAp3QXdiRjVXaVhDd2JtaERCbW4xVVBrMjdPQUV0TzRSM3luaXM0eGNJbmVTQwotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== + + minio-key.pem: 
LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlKS2dJQkFBS0NBZ0VBbmp1N3RwUXd2Ri9Way9NdVJockpZVjBNSnF0ckZKL05YKzFPTEphTWhGWC9LWTEwCkxQK0I2NXcvcGxaR3lKdGJ0VWRXajV3WkxyWlFtSmIzcDRUdHVLNEJUMWd1M2JTWktJSFFOZUFnODFLc01HcSsKNXVYT29QV05yQWhoNGhZT0o0NW1Jc1liYTBEZE8yTUZ2OVd1V1ZuM1Q2Rnp6TTdxTGRDSnpaTmplYUI3bVRKagpkSEdyODRpVDg2QUVBK0h5d3ZzcU9valplK1VqS3U4WHJqeFVHUi9lUEZ2UUN5TWRXZ0ZiandpakovQm44UkNBClJJdWlFc3NqU0xRV09nUWdySUFUdkVGL2N5VWRaS0doYWMzLzRJNzF4RFdoWDcxWFdZd09ORW1yUTZjR3pYbXYKY1VNVjhIdkVXTXpiMDVScE9ZenltS3JieE5MNGVXTmFLdnJ2Y1p6Y016cElNNFJlR0t3Mk45c0FHczNZQlRSNwpXWExLV2duTFlic29IeDNkZFp1eVErSEp3RFRYcXBYdXRZaGFvQ2ZmSDIzbk1NRmlBSzFpbWViQkkxaFVjQWlQCnZwU3g3ZEkzbWdOUDRhZk4vb3FoTU9QZzhUeG13TWVrdnB6LzdjV2JDT05qa2Q5ZHEwVkxMU1crdHFJZmZWUWoKMExPVVB2WHJMT21QbWNMNmxTbFJPODg1VFZ2eEpEYnQyWFVSbWhxSnpwYXJJZk5oRjlFbHBIbWJzZHNsbVlQbwovOUpXVW1ya2JKNkFhZmRsSm5yQ1RHeEpwaXdMMGxKb3hJd3ZxWXQ4REJ1YzFjTkZLSklTTFpJeW82dVhSdU5WCm9OcHJ4Z2ZVdmw4Q2xwM2dTK21JU0ZnMzN1MmtKY2Jxem52dmMzdGJsSGUweGcyTUhNSVZUZFppUmpjQ0F3RUEKQVFLQ0FnQVhHQWE4amdKUzYvWERBeUlFejFJRzZNcW1OaXlKdFEwSGJCNFZ1ZDlHVFRyUmVMaTAvSkdjcnBCSAptWjM1RjF1YUtKQkVvM2ExYjV4eHVNN3FYeWRHNWZhQSt4RFVBTkM5cmJ5U3NHUit2dGtzczllcTRXMTM1bjdICjFlMWJUdmEvNVRPWTdhc0F5MVcrbmlRdnJHTW0zVStRQ3JOWTkvWUx1N3p4Q1FyaXJINTlqSEloZzVtaUVKUHYKWWJKVVVyellva20yZzFTaWxYMjlmV25LWHpteTlRaTliSFQvdXg5RWpLQXRUd2hwQXRoWXdaekc1RTVDU2UyYgpaZFU4b0crWVhaVUR5OWRyR2NhaGNrbVpwSndzelJDbmsyQTdGZXBTd25Nc1JIZy9obmdpc3hqZEFmcUl2N2VYCmNrYS9LWkQxK2xGSjROMzBhd29peFZKYXBZY2VwZk1hMS83dE1vZFFsOXdaOVZLWTZ6YlEwL1U0QndlMGQ0OEYKQ1graVlOZ2t4UWRmdVdwMFU2RkVlUTluR2tPMndZQUJxMCtzSDIxU2puRTQvTXh5anpLZCtjR08zUkdkTktxUwo5QTVubkh4MUwxVDN6Z0hOR2ZHS1F6Tzg5L09sVDBWVE80OEhkamxva0hmc3VTVG03N2tkZkU1TVFwamF2WktaCmo0QXoyWENGWkM2WkJxYm9wZlA1amVNWmI1WDU0aXVtclIwcHpRRGloQ3ZZWmYxTlVDa3hFdFZmaTF1eUtvLzYKMzhQK0pDcEtWSk1mYzhyYTFlWVRTV0ZaZDc1UXVMK1FtblpPVUNqQktXMnNQQTVGbERyTkVTdTQrREhCVVFtOApxdUxDUGdLaHA1TmVJRDVjcm5iVElYclVCb2tQdHpsWm10SEs5TFRYeTNPWkdXUmt5UUtDQVFFQTF0OFRhdWdCCmpMUVI2NXBTbGRXTDdVSnVGVlZUVW9DSlB5cHlOQjkvc1VsTC9Nd1RBbHlhWHoveU15Q2VCdWt3cnBMT1M0NHMKaG5kQlJOL3ZsdkRCaEovVjdYaDBEUWUvMGlqczRJdGNYQ1lpN3hFcWZOd1FQTUJEKzVyWkdKeU1iOEtLV3YwSwpBUnhES0k0YytLUkQwemQ1d1ZtelZSTjdLZlUzT3FXbGV1TjNMTFZqN3R6YU9kT2xSU0E3YWlCTS9odWQ1VFE5CkUwcEF3SDhIaGMxYW1qaUM4dEJsYUZlZ0lodXpJenhNU1hIUkJVcDNsaDMvb2UzNjM4Mm5zRUxjbE4xaFVWRGsKdDNUQVpjdHlYRkIzSEUydHpJdm9xRUpRN0Zkd3MwNUVQZXFIODFOekdjRlRNS1NieVJzNmtYYzhFQ0hPc2lYSAp6TDd5dlI3S1BmVHZhd0tDQVFFQXZJVlZRV3lpcU5ScTdTQkd3czg3WjVjZFlJOGdwSkI4bFlySklqaTRyVUVFCk14MmdVeCtYaHM5QTJSczQxZ1hsYXdvRWNqUDliZXJ2ZTYzMVZOV0M0K3Q5cFR2Vm9qcVhtcnZaNVVEN3V2Q0kKRlFPLy9JSUdqa0tFZkRwSUgvcWxEUlZlbEZTU1JjOVEvY0piZlNwS2JsYnJYZ1FtdG5KOWpsQkpFL1NMSW14UAo3OURVdGlmWmx5cFVRbDl5YzhSZzFSYmpyQWtjQVZhOVBHMXQ3cGhTanJkZHRKbXRVUmtFdGhYWTc3R3c5WHJUCjgwWlJHdkpIS0lsWlBmaHF2WlNGQzg4MVJJZ0lpRitCdWxobm16TUo0dmdYeXEwVCtRY1VGN0FBdFBRU0hyMHIKQm5wN1JlUDF5R201UDd0MjNmRU00Z0R1RENBUHQ0R1lZeUxFY2dpelpRS0NBUUVBaE9MVGJITnR1ZW9IaHpFYQowQ1dRY3p4NVBtSlZ0SmxmeUJ2bEkwMHp1SjMvQzZuZU84Q3ZqQ2JORUVlazA5dFZ5ekZwdWhxRWVPaTZDZkdBCmlGWC9LSmw5UVc4VVBwYkRVQ01WVkUxNzRsV0hsMWlEY1ZMY0MrWlFaUVBBTGROcm14YXlZRkZMNWFIbit1WGgKRHZqd0pXbVN1RHhVaDFJVUFyL3YxeXBvckJhUE5xdzcwSmJ2czRHc0haTXdpNUxNYXY4RGFLUWsvWkFYZWJWVwpIcThBMEk0UWxrREI1b1VDdVBWdWxXVU9QUUhSNWpiR3ZLVnkybCtHbnZEZU8wa3VpRFpkb0YrcUE3ZUY0YTZ2CjNGMjdQRnJpR0xXU1ByVTh2TjNiQ2xsbUpQQ3VBWk5qaE5NbU10Z3FySFpWZzI4OVN6RE5WeW04Wm1qVlVKY0IKTnM0TFh3S0NBUUVBdDRua0tBOFpDZC9NdmxJbk1qREorQit5ZFRqRG9oUWRod1lZcmgybEJ1QitzemxMeHNIQwpKM2lOL1JFNHMzNElEcjh3OXZMUThIdkRicGs5ZWJ0cGRIYm4yNysyVFB4WWIwZ21hc0ZxazJUc1IvRmZyL256CllmczJ1eStPMnJ1T2gzOWZsbkFEL0wxTGI5TVNlWGg4QUp
MVkViSmU4ay9qRjNQb3dlbmFyOGZkeDNCOE4xL3kKd3U1dUhEU0szRlM3cFpwa1REQ09PR3QzVDJhR21iMW8yeE9Bd255L3RXM3pIVWVGN2s4RUp1clBnVkRiVTYyLwpRNkw4NUkxL2RsVXJkd1RrS25WNlFUTWl2UWFtei8zUHlVNmE4ekt3ZUVuQThSTGtqVWYyZ0VEUnE3d0JXbGtICkNIaU41NU9ldFpPaVpFSmRnQ2FTeHFrQWNMdi9uN29DMVFLQ0FRRUFxRkNHVDFWWG4yUGEwdFQ2ZCtvRnZYYTkKSENVMTFEbG9ad1hUOTY4cmhGOEJSazdLRVVvZXpFdjZiTUZsdUwzak9jMDNkUUs1WlF0anZUQkZKYlc3NVZMVgphcnR1U0xiVS9CVytnRGtZWmszQ241Z1B6QzlIbGRDa3MrS0lDOHJBcUNPdW9NRzc3SFlOVys3ckJLS3did2w1CmtDQW1uSmE2NWZZczdDWXpEOThmb0crVmxsc25VWCttMUxMZUtjclBEZWlpcW5kQmFTWi9NRVJnWmE2SXZid2kKMDVtNnFqL3ZXL1ZiV05iNVR4Z2N5MWpOOXpRbWJONFJ0Zmdzc3NKRmZzS3JNS0lxVnp1NkNMcEJ4eXBOUXZHYQo0S3UzVFZGcm9zaFlxWUpMVm1xVklYT1dWZk9IQTRMT2VpNmtDZTlHaTQydjdqS014M0dEK25CK1BWbVFXZz09Ci0tLS0tRU5EIFJTQSBQUklWQVRFIEtFWS0tLS0tCg== \ No newline at end of file diff --git a/infrastructure/kubernetes/overlays/dev/kustomization.yaml b/infrastructure/kubernetes/overlays/dev/kustomization.yaml index e181adfa..361148f5 100644 --- a/infrastructure/kubernetes/overlays/dev/kustomization.yaml +++ b/infrastructure/kubernetes/overlays/dev/kustomization.yaml @@ -666,7 +666,7 @@ replicas: - name: tenant-service count: 1 - name: training-service - count: 1 + count: 2 # Safe with MinIO storage - name: forecasting-service count: 1 - name: sales-service diff --git a/infrastructure/kubernetes/overlays/prod/kustomization.yaml b/infrastructure/kubernetes/overlays/prod/kustomization.yaml index 43ebe17c..2b101877 100644 --- a/infrastructure/kubernetes/overlays/prod/kustomization.yaml +++ b/infrastructure/kubernetes/overlays/prod/kustomization.yaml @@ -200,7 +200,7 @@ replicas: - name: tenant-service count: 2 - name: training-service - count: 2 + count: 3 # Safe with MinIO storage - no PVC conflicts - name: forecasting-service count: 3 - name: sales-service diff --git a/infrastructure/kubernetes/overlays/prod/storage-patch.yaml b/infrastructure/kubernetes/overlays/prod/storage-patch.yaml deleted file mode 100644 index 0cc89883..00000000 --- a/infrastructure/kubernetes/overlays/prod/storage-patch.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: model-storage - namespace: bakery-ia -spec: - storageClassName: microk8s-hostpath # MicroK8s storage class - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi # Increased for production (adjust based on your needs) diff --git a/infrastructure/signoz/dashboards/user-activity.json b/infrastructure/signoz/dashboards/user-activity.json index 0d0e0ef3..34850449 100644 --- a/infrastructure/signoz/dashboards/user-activity.json +++ b/infrastructure/signoz/dashboards/user-activity.json @@ -31,7 +31,7 @@ "y": 3, "w": 6, "h": 3, - "i": "api-calls-per-user", + "i": "user-actions", "moved": false, "static": false }, @@ -40,7 +40,16 @@ "y": 3, "w": 6, "h": 3, - "i": "session-duration", + "i": "page-views", + "moved": false, + "static": false + }, + { + "x": 0, + "y": 6, + "w": 12, + "h": 4, + "i": "geo-visitors", "moved": false, "static": false } @@ -51,7 +60,7 @@ "name": "service", "description": "Filter by service name", "type": "QUERY", - "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'active_users' AND value != '' ORDER BY value", + "queryValue": "SELECT DISTINCT(serviceName) FROM signoz_traces.distributed_signoz_index_v2 ORDER BY serviceName", "customValue": "", "textboxValue": "", "showALLOption": true, @@ -59,7 +68,7 @@ "order": 1, "modificationUUID": "", "sort": "ASC", - 
"selectedValue": null + "selectedValue": "bakery-frontend" } }, "widgets": [ @@ -75,26 +84,26 @@ "builder": { "queryData": [ { - "dataSource": "metrics", + "dataSource": "traces", "queryName": "A", - "aggregateOperator": "sum", + "aggregateOperator": "count_distinct", "aggregateAttribute": { - "key": "active_users", - "dataType": "int64", - "type": "Gauge", - "isColumn": false + "key": "user.id", + "dataType": "string", + "type": "tag", + "isColumn": true }, - "timeAggregation": "latest", + "timeAggregation": "count_distinct", "spaceAggregation": "sum", "functions": [], "filters": { "items": [ { "key": { - "key": "service.name", + "key": "serviceName", "dataType": "string", - "type": "resource", - "isColumn": false + "type": "tag", + "isColumn": true }, "op": "=", "value": "{{.service}}" @@ -110,13 +119,13 @@ "orderBy": [], "groupBy": [ { - "key": "service.name", + "key": "serviceName", "dataType": "string", - "type": "resource", - "isColumn": false + "type": "tag", + "isColumn": true } ], - "legend": "{{service.name}}", + "legend": "{{serviceName}}", "reduceTo": "sum" } ], @@ -139,16 +148,16 @@ "builder": { "queryData": [ { - "dataSource": "metrics", + "dataSource": "traces", "queryName": "A", - "aggregateOperator": "sum", + "aggregateOperator": "count", "aggregateAttribute": { - "key": "user_sessions_total", - "dataType": "int64", - "type": "Counter", - "isColumn": false + "key": "session.id", + "dataType": "string", + "type": "tag", + "isColumn": true }, - "timeAggregation": "sum", + "timeAggregation": "count", "spaceAggregation": "sum", "functions": [], "filters": { @@ -162,6 +171,16 @@ }, "op": "=", "value": "{{.service}}" + }, + { + "key": { + "key": "span.name", + "dataType": "string", + "type": "tag", + "isColumn": true + }, + "op": "=", + "value": "user_session" } ], "op": "AND" @@ -192,9 +211,9 @@ "yAxisUnit": "none" }, { - "id": "api-calls-per-user", - "title": "API Calls per User", - "description": "Average API calls per user by service", + "id": "user-actions", + "title": "User Actions", + "description": "Total user actions by service", "isStacked": false, "nullZeroValues": "zero", "opacity": "1", @@ -203,17 +222,17 @@ "builder": { "queryData": [ { - "dataSource": "metrics", + "dataSource": "traces", "queryName": "A", - "aggregateOperator": "avg", + "aggregateOperator": "count", "aggregateAttribute": { - "key": "api_calls_per_user", - "dataType": "float64", - "type": "Gauge", - "isColumn": false + "key": "user.action", + "dataType": "string", + "type": "tag", + "isColumn": true }, - "timeAggregation": "avg", - "spaceAggregation": "avg", + "timeAggregation": "count", + "spaceAggregation": "sum", "functions": [], "filters": { "items": [ @@ -226,6 +245,16 @@ }, "op": "=", "value": "{{.service}}" + }, + { + "key": { + "key": "span.name", + "dataType": "string", + "type": "tag", + "isColumn": true + }, + "op": "=", + "value": "user_action" } ], "op": "AND" @@ -245,7 +274,7 @@ } ], "legend": "{{serviceName}}", - "reduceTo": "avg" + "reduceTo": "sum" } ], "queryFormulas": [] @@ -256,9 +285,9 @@ "yAxisUnit": "none" }, { - "id": "session-duration", - "title": "Session Duration", - "description": "Average session duration by service", + "id": "page-views", + "title": "Page Views", + "description": "Total page views by service", "isStacked": false, "nullZeroValues": "zero", "opacity": "1", @@ -267,17 +296,17 @@ "builder": { "queryData": [ { - "dataSource": "metrics", + "dataSource": "traces", "queryName": "A", - "aggregateOperator": "avg", + "aggregateOperator": "count", 
"aggregateAttribute": { - "key": "session_duration_seconds", - "dataType": "float64", - "type": "Gauge", - "isColumn": false + "key": "page.path", + "dataType": "string", + "type": "tag", + "isColumn": true }, - "timeAggregation": "avg", - "spaceAggregation": "avg", + "timeAggregation": "count", + "spaceAggregation": "sum", "functions": [], "filters": { "items": [ @@ -290,6 +319,16 @@ }, "op": "=", "value": "{{.service}}" + }, + { + "key": { + "key": "span.name", + "dataType": "string", + "type": "tag", + "isColumn": true + }, + "op": "=", + "value": "page_view" } ], "op": "AND" @@ -309,7 +348,7 @@ } ], "legend": "{{serviceName}}", - "reduceTo": "avg" + "reduceTo": "sum" } ], "queryFormulas": [] @@ -317,7 +356,74 @@ "queryType": "builder" }, "fillSpans": false, - "yAxisUnit": "seconds" + "yAxisUnit": "none" + }, + { + "id": "geo-visitors", + "title": "Geolocation Visitors", + "description": "Number of visitors who shared location data", + "isStacked": false, + "nullZeroValues": "zero", + "opacity": "1", + "panelTypes": "value", + "query": { + "builder": { + "queryData": [ + { + "dataSource": "traces", + "queryName": "A", + "aggregateOperator": "count", + "aggregateAttribute": { + "key": "user.id", + "dataType": "string", + "type": "tag", + "isColumn": true + }, + "timeAggregation": "count", + "spaceAggregation": "sum", + "functions": [], + "filters": { + "items": [ + { + "key": { + "key": "serviceName", + "dataType": "string", + "type": "tag", + "isColumn": true + }, + "op": "=", + "value": "{{.service}}" + }, + { + "key": { + "key": "span.name", + "dataType": "string", + "type": "tag", + "isColumn": true + }, + "op": "=", + "value": "user_location" + } + ], + "op": "AND" + }, + "expression": "A", + "disabled": false, + "having": [], + "stepInterval": 60, + "limit": null, + "orderBy": [], + "groupBy": [], + "legend": "Visitors with Location Data (See GEOLOCATION_VISUALIZATION_GUIDE.md for map integration)", + "reduceTo": "sum" + } + ], + "queryFormulas": [] + }, + "queryType": "builder" + }, + "fillSpans": false, + "yAxisUnit": "none" } ] } \ No newline at end of file diff --git a/infrastructure/tls/ca/ca-cert.srl b/infrastructure/tls/ca/ca-cert.srl index 2ba9324d..7db51191 100644 --- a/infrastructure/tls/ca/ca-cert.srl +++ b/infrastructure/tls/ca/ca-cert.srl @@ -1 +1 @@ -1BE074336AF19EA8C676D7E8D0185EBCA0B1D1FF +1BE074336AF19EA8C676D7E8D0185EBCA0B1D202 diff --git a/infrastructure/tls/generate-minio-certificates.sh b/infrastructure/tls/generate-minio-certificates.sh new file mode 100755 index 00000000..45cc3026 --- /dev/null +++ b/infrastructure/tls/generate-minio-certificates.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash + +# Generate MinIO TLS certificates using existing CA +# This script generates certificates for MinIO server + +set -e + +TLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CA_DIR="$TLS_DIR/ca" +MINIO_DIR="$TLS_DIR/minio" + +mkdir -p "$MINIO_DIR" + +echo "Generating MinIO TLS certificates using existing CA..." +echo "CA Directory: $CA_DIR" +echo "MinIO Directory: $MINIO_DIR" +echo "" + +# Check if CA exists +if [ ! -f "$CA_DIR/ca-cert.pem" ] || [ ! -f "$CA_DIR/ca-key.pem" ]; then + echo "ERROR: CA certificates not found. Please run generate-certificates.sh first." + exit 1 +fi + +# Generate MinIO server private key +echo "Step 1: Generating MinIO server private key..." +openssl genrsa -out "$MINIO_DIR/minio-key.pem" 4096 + +# Convert to traditional RSA format (required by MinIO) +echo "Step 1b: Converting private key to traditional RSA format..." 
+openssl rsa -in "$MINIO_DIR/minio-key.pem" -traditional -out "$MINIO_DIR/minio-key.pem" + +# Create certificate signing request (CSR) +echo "Step 2: Creating MinIO certificate signing request..." +openssl req -new -key "$MINIO_DIR/minio-key.pem" -out "$MINIO_DIR/minio.csr" \ + -subj "/C=US/ST=California/L=SanFrancisco/O=BakeryIA/OU=Storage/CN=minio.bakery-ia.svc.cluster.local" + +# Create SAN (Subject Alternative Names) configuration for MinIO +cat > "$MINIO_DIR/san.cnf" <2 hours)...") + subprocess.run(['docker', 'image', 'prune', '-a', '-f', + '--filter', 'until=2h'], + capture_output=True, text=True) + + if verbose: + print("โœ… Docker image cleanup completed") + return True + except Exception as e: + print(f"โš ๏ธ Docker image cleanup failed: {e}") + return False + +def cleanup_docker_containers(verbose=False): + """Clean up stopped containers""" + if verbose: + print("๐Ÿงน Cleaning up Docker containers...") + + try: + # Remove stopped containers + if verbose: + print(" Removing stopped containers...") + subprocess.run(['docker', 'container', 'prune', '-f'], + capture_output=True, text=True) + + # Remove old containers (older than 1 hour) + if verbose: + print(" Removing old containers (>1 hour)...") + subprocess.run(['docker', 'container', 'prune', '-f', + '--filter', 'until=1h'], + capture_output=True, text=True) + + if verbose: + print("โœ… Docker container cleanup completed") + return True + except Exception as e: + print(f"โš ๏ธ Docker container cleanup failed: {e}") + return False + +def cleanup_docker_volumes(verbose=False): + """Clean up unused volumes""" + if verbose: + print("๐Ÿงน Cleaning up Docker volumes...") + + try: + # Remove unused volumes + if verbose: + print(" Removing unused volumes...") + subprocess.run(['docker', 'volume', 'prune', '-f'], + capture_output=True, text=True) + + if verbose: + print("โœ… Docker volume cleanup completed") + return True + except Exception as e: + print(f"โš ๏ธ Docker volume cleanup failed: {e}") + return False + +def cleanup_docker_system(verbose=False): + """Clean up Docker system (build cache, networks, etc.)""" + if verbose: + print("๐Ÿงน Cleaning up Docker system...") + + try: + # Remove build cache + if verbose: + print(" Removing build cache...") + subprocess.run(['docker', 'builder', 'prune', '-f'], + capture_output=True, text=True) + + # Remove unused networks + if verbose: + print(" Removing unused networks...") + subprocess.run(['docker', 'network', 'prune', '-f'], + capture_output=True, text=True) + + if verbose: + print("โœ… Docker system cleanup completed") + return True + except Exception as e: + print(f"โš ๏ธ Docker system cleanup failed: {e}") + return False + +def cleanup_kubernetes_resources(verbose=False): + """Clean up Kubernetes resources""" + if verbose: + print("๐Ÿงน Cleaning up Kubernetes resources...") + + try: + # Remove completed jobs older than 1 hour + if verbose: + print(" Removing completed jobs (>1 hour)...") + subprocess.run(['kubectl', 'delete', 'jobs', '-n', 'bakery-ia', + '--field-selector=status.successful=1'], + capture_output=True, text=True) + + # Remove failed jobs older than 1 hour + if verbose: + print(" Removing failed jobs (>1 hour)...") + subprocess.run(['kubectl', 'delete', 'jobs', '-n', 'bakery-ia', + '--field-selector=status.failed>0'], + capture_output=True, text=True) + + if verbose: + print("โœ… Kubernetes resource cleanup completed") + return True + except Exception as e: + print(f"โš ๏ธ Kubernetes resource cleanup failed: {e}") + return False + +def 
perform_cleanup(manual=False, threshold_gb=10, verbose=False): + """Perform comprehensive cleanup""" + + print("\n" + "="*60) + print("๐Ÿš€ STARTING COMPREHENSIVE CLEANUP") + print("="*60) + + if manual: + print("๐ŸŽ›๏ธ Mode: MANUAL (forced cleanup)") + else: + print("๐ŸŽ›๏ธ Mode: AUTOMATIC (threshold-based)") + + print(f"๐Ÿ“Š Threshold: {threshold_gb}GB free space") + + # Check disk space before cleanup + free_space_before = get_disk_space() + print(f"๐Ÿ“Š Disk space before cleanup: {free_space_before:.1f}GB free") + + # Check if cleanup is needed (unless manual) + if not manual and free_space_before >= threshold_gb: + print("โœ… Sufficient disk space available, skipping cleanup") + return True + + cleanup_results = [] + + # Perform all cleanup operations + cleanup_results.append(("Docker Images", cleanup_docker_images(verbose))) + cleanup_results.append(("Docker Containers", cleanup_docker_containers(verbose))) + cleanup_results.append(("Docker Volumes", cleanup_docker_volumes(verbose))) + cleanup_results.append(("Docker System", cleanup_docker_system(verbose))) + cleanup_results.append(("Kubernetes Resources", cleanup_kubernetes_resources(verbose))) + + # Check disk space after cleanup + free_space_after = get_disk_space() + space_reclaimed = free_space_after - free_space_before + + print(f"\n๐Ÿ“Š Disk space after cleanup: {free_space_after:.1f}GB free") + print(f"๐ŸŽฏ Space reclaimed: {space_reclaimed:.1f}GB") + + # Summary + print("\n๐Ÿ“‹ CLEANUP SUMMARY:") + for name, success in cleanup_results: + status = "โœ… SUCCESS" if success else "โŒ FAILED" + print(f" {name}: {status}") + + print("="*60) + print("โœ… CLEANUP COMPLETED") + print("="*60 + "\n") + + return True + +def main(): + parser = argparse.ArgumentParser( + description='Bakery IA Disk Space Cleanup Script', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + ./cleanup_disk_space.py # Automatic cleanup (checks threshold) + ./cleanup_disk_space.py --manual # Force cleanup regardless of threshold + ./cleanup_disk_space.py --threshold 5 # Use 5GB threshold + ./cleanup_disk_space.py --verbose # Verbose output + """ + ) + + parser.add_argument('--manual', action='store_true', + help='Force cleanup regardless of disk space threshold') + parser.add_argument('--threshold', type=int, default=10, + help='Minimum free space required in GB (default: 10)') + parser.add_argument('--verbose', action='store_true', + help='Enable verbose output') + + args = parser.parse_args() + + # Get threshold from environment variable if set + env_threshold = os.getenv('TILT_DISK_THRESHOLD_GB') + if env_threshold: + try: + args.threshold = int(env_threshold) + except ValueError: + pass + + # Get verbose from environment variable if set + env_verbose = os.getenv('TILT_CLEANUP_VERBOSE', 'false').lower() + if env_verbose == 'true': + args.verbose = True + + return perform_cleanup( + manual=args.manual, + threshold_gb=args.threshold, + verbose=args.verbose + ) + +if __name__ == '__main__': + success = main() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/services/forecasting/Dockerfile b/services/forecasting/Dockerfile index 45dcc147..c1edb86d 100644 --- a/services/forecasting/Dockerfile +++ b/services/forecasting/Dockerfile @@ -1,10 +1,10 @@ -# Forecasting Dockerfile -# Add this stage at the top of each service Dockerfile +# Forecasting Service Dockerfile with MinIO Support +# Multi-stage build for optimized production image FROM python:3.11-slim AS shared WORKDIR /shared COPY shared/ /shared/ -# 
Then your main service stage +# Main service stage FROM python:3.11-slim WORKDIR /app diff --git a/services/forecasting/app/core/config.py b/services/forecasting/app/core/config.py index 61bdf12c..021a9aae 100644 --- a/services/forecasting/app/core/config.py +++ b/services/forecasting/app/core/config.py @@ -49,6 +49,18 @@ class ForecastingSettings(BaseServiceSettings): PREDICTION_CACHE_TTL_HOURS: int = int(os.getenv("PREDICTION_CACHE_TTL_HOURS", "6")) FORECAST_BATCH_SIZE: int = int(os.getenv("FORECAST_BATCH_SIZE", "100")) + # MinIO Configuration + MINIO_ENDPOINT: str = os.getenv("MINIO_ENDPOINT", "minio.bakery-ia.svc.cluster.local:9000") + MINIO_ACCESS_KEY: str = os.getenv("FORECASTING_MINIO_ACCESS_KEY", "forecasting-service") + MINIO_SECRET_KEY: str = os.getenv("FORECASTING_MINIO_SECRET_KEY", "forecasting-secret-key") + MINIO_USE_SSL: bool = os.getenv("MINIO_USE_SSL", "true").lower() == "true" + MINIO_MODEL_BUCKET: str = os.getenv("MINIO_MODEL_BUCKET", "training-models") + MINIO_CONSOLE_PORT: str = os.getenv("MINIO_CONSOLE_PORT", "9001") + MINIO_API_PORT: str = os.getenv("MINIO_API_PORT", "9000") + MINIO_REGION: str = os.getenv("MINIO_REGION", "us-east-1") + MINIO_MODEL_LIFECYCLE_DAYS: int = int(os.getenv("MINIO_MODEL_LIFECYCLE_DAYS", "90")) + MINIO_CACHE_TTL_SECONDS: int = int(os.getenv("MINIO_CACHE_TTL_SECONDS", "3600")) + # Real-time Forecasting REALTIME_FORECASTING_ENABLED: bool = os.getenv("REALTIME_FORECASTING_ENABLED", "true").lower() == "true" FORECAST_UPDATE_INTERVAL_HOURS: int = int(os.getenv("FORECAST_UPDATE_INTERVAL_HOURS", "6")) diff --git a/services/forecasting/app/services/prediction_service.py b/services/forecasting/app/services/prediction_service.py index dcfe6010..1c959686 100644 --- a/services/forecasting/app/services/prediction_service.py +++ b/services/forecasting/app/services/prediction_service.py @@ -16,6 +16,7 @@ import httpx from pathlib import Path import os import joblib +import io from app.core.config import settings from shared.monitoring.metrics import MetricsCollector @@ -578,118 +579,114 @@ class PredictionService: return adjusted async def _load_model(self, model_id: str, model_path: str): - """Load model from file with improved validation and error handling""" - - # Enhanced model file validation - if not await self._validate_model_file(model_path): - logger.error(f"Model file not valid: {model_path}") - return None - + """Load model from MinIO with improved validation and error handling""" + # Check cache first if model_id in self.model_cache: cached_model, cached_time = self.model_cache[model_id] if (datetime.now() - cached_time).seconds < self.cache_ttl: + logger.debug(f"Model loaded from cache: {model_id}") return cached_model - + + # Validate MinIO path format + if not await self._validate_model_file(model_path): + logger.error(f"Model path not valid: {model_path}") + return None + try: - if os.path.exists(model_path): - # Try multiple loading methods for compatibility - model = await self._load_model_safely(model_path) - - if model is None: - logger.error(f"Failed to load model from: {model_path}") - return None - - # Cache the model - self.model_cache[model_id] = (model, datetime.now()) - logger.info(f"Model loaded successfully: {model_path}") - return model - else: - logger.error(f"Model file not found: {model_path}") + # Load from MinIO + model = await self._load_model_safely(model_path) + + if model is None: + logger.error(f"Failed to load model from MinIO: {model_path}") return None - + + # Cache the model + self.model_cache[model_id] = (model, 
datetime.now()) + logger.info(f"Model loaded successfully from MinIO: {model_path}") + return model + except Exception as e: - logger.error(f"Error loading model: {e}") + logger.error(f"Error loading model from MinIO: {e}") return None async def _load_model_safely(self, model_path: str): - """Safely load model with multiple fallback methods""" - - # Method 1: Try joblib first (recommended for sklearn/Prophet models) + """Load model from MinIO storage (clean implementation - MinIO only)""" try: - logger.debug(f"Attempting to load model with joblib: {model_path}") - model = joblib.load(model_path) - logger.info(f"Model loaded successfully with joblib") - return model + # Parse MinIO path: minio://bucket_name/object_path + _, bucket_and_path = model_path.split("://", 1) + bucket_name, object_name = bucket_and_path.split("/", 1) + + logger.debug(f"Loading model from MinIO: {bucket_name}/{object_name}") + + # Use MinIO client + from shared.clients.minio_client import minio_client + + # Download model data + model_data = minio_client.get_object(bucket_name, object_name) + if not model_data: + logger.error(f"Failed to download model from MinIO: {model_path}") + return None + + # Try joblib first (using BytesIO since joblib.load reads from file-like objects) + try: + buffer = io.BytesIO(model_data) + model = joblib.load(buffer) + logger.info(f"Model loaded successfully from MinIO with joblib") + return model + except Exception as e: + logger.warning(f"Joblib loading from MinIO failed: {e}") + + # Try pickle as fallback + try: + model = pickle.loads(model_data) + logger.info(f"Model loaded successfully from MinIO with pickle") + return model + except Exception as e: + logger.warning(f"Pickle loading from MinIO failed: {e}") + + logger.error(f"All loading methods failed for MinIO object: {model_path}") + return None + except Exception as e: - logger.warning(f"Joblib loading failed: {e}") - - # Method 2: Try pickle as fallback - try: - logger.debug(f"Attempting to load model with pickle: {model_path}") - with open(model_path, 'rb') as f: - model = pickle.load(f) - logger.info(f"Model loaded successfully with pickle") - return model - except Exception as e: - logger.warning(f"Pickle loading failed: {e}") - - # Method 3: Try pandas pickle (for Prophet models saved with pandas) - try: - logger.debug(f"Attempting to load model with pandas: {model_path}") - import pandas as pd - model = pd.read_pickle(model_path) - logger.info(f"Model loaded successfully with pandas") - return model - except Exception as e: - logger.warning(f"Pandas loading failed: {e}") - - logger.error(f"All loading methods failed for: {model_path}") - return None + logger.error(f"Failed to load model from MinIO: {model_path}, error: {e}") + return None async def _validate_model_file(self, model_path: str) -> bool: - """Enhanced model file validation""" + """Validate MinIO model path and check object exists""" try: - if not os.path.exists(model_path): - logger.error(f"Model file not found: {model_path}") + # Validate MinIO path format + if not model_path.startswith("minio://"): + logger.error(f"Invalid model path format (expected minio://): {model_path}") return False - - # Check file size (should be > 1KB for a trained model) - file_size = os.path.getsize(model_path) - if file_size < 1024: - logger.warning(f"Model file too small ({file_size} bytes): {model_path}") - return False - - # More comprehensive file format detection + + # Parse MinIO path try: - with open(model_path, 'rb') as f: - header = f.read(16) # Read more bytes for 
better detection
-                    
-                    # Check for various pickle/joblib signatures
-                    valid_signatures = [
-                        b']\x93PICKLE',  # Joblib
-                        b'\x80\x03',     # Pickle protocol 3
-                        b'\x80\x04',     # Pickle protocol 4
-                        b'\x80\x05',     # Pickle protocol 5
-                        b'}\x94',        # Newer joblib format
-                        b'}\x93',        # Alternative joblib format
-                    ]
-                    
-                    is_valid_format = any(header.startswith(sig) for sig in valid_signatures)
-                    
-                    if not is_valid_format:
-                        # Log header for debugging but don't fail validation
-                        logger.warning(f"Unrecognized file header: {header[:8]} for {model_path}")
-                        logger.info("Proceeding with loading attempt despite unrecognized header")
-                        # Return True to allow loading attempt - some valid files may have different headers
-                        return True
-                    
-                    return True
-                    
-            except Exception as e:
-                logger.error(f"Error reading model file header: {e}")
+                _, bucket_and_path = model_path.split("://", 1)
+                bucket_name, object_name = bucket_and_path.split("/", 1)
+            except ValueError:
+                logger.error(f"Cannot parse MinIO path: {model_path}")
                 return False
-            
+
+            # Check if object exists in MinIO
+            from shared.clients.minio_client import minio_client
+
+            if not minio_client.object_exists(bucket_name, object_name):
+                logger.error(f"Model object not found in MinIO: {bucket_name}/{object_name}")
+                return False
+
+            # Check object metadata for size validation (size may be unavailable)
+            metadata = minio_client.get_object_metadata(bucket_name, object_name)
+            file_size = metadata.get("size", 0) if metadata else None
+            if file_size is not None and file_size < 1024:
+                logger.warning(f"Model object too small ({file_size} bytes): {model_path}")
+                return False
+
+            logger.debug(f"Model validated in MinIO: {bucket_name}/{object_name}, size={file_size}")
+
+            return True
+
         except Exception as e:
             logger.error(f"Model validation error: {e}")
             return False
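The forecasting loader above resolves `minio://bucket/object` references and deserializes from in-memory buffers instead of filesystem paths. A minimal round-trip sketch of that convention (illustrative tenant, product, and bucket names; it assumes the shared `minio_client` introduced later in this diff is importable):

```python
import io
import joblib

from shared.clients.minio_client import minio_client


def store_and_reload(model, tenant_id: str, product_id: str, model_id: str):
    """Round-trip a model through MinIO using the minio:// path convention."""
    object_name = f"models/{tenant_id}/{product_id}/{model_id}.pkl"

    # Serialize to an in-memory buffer (joblib accepts file-like objects)
    buffer = io.BytesIO()
    joblib.dump(model, buffer)
    minio_client.put_object("training-models", object_name, buffer.getvalue())

    # Reference stored in the database; _validate_model_file() expects this scheme
    model_path = f"minio://training-models/{object_name}"

    # Parse the reference back and reload the model for inference
    _, bucket_and_path = model_path.split("://", 1)
    bucket_name, key = bucket_and_path.split("/", 1)
    return joblib.load(io.BytesIO(minio_client.get_object(bucket_name, key)))
```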
diff --git a/services/forecasting/requirements.txt b/services/forecasting/requirements.txt
index 9c077952..71910bc8 100644
--- a/services/forecasting/requirements.txt
+++ b/services/forecasting/requirements.txt
@@ -31,6 +31,7 @@ scikit-learn==1.6.1
 pandas==2.2.3
 numpy==2.2.2
 joblib==1.4.2
+minio==7.2.2
 
 # Messaging
 aio-pika==9.4.3
diff --git a/services/training/Dockerfile b/services/training/Dockerfile
index 6bb905e9..7662e2a2 100644
--- a/services/training/Dockerfile
+++ b/services/training/Dockerfile
@@ -1,10 +1,10 @@
-# Training Dockerfile
-# Add this stage at the top of each service Dockerfile
+# Training Service Dockerfile with MinIO Support
+# Multi-stage build for optimized production image
 FROM python:3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/
 
-# Then your main service stage
+# Main service stage
 FROM python:3.11-slim
 WORKDIR /app
diff --git a/services/training/README.md b/services/training/README.md
index e93e3f70..4483dfd6 100644
--- a/services/training/README.md
+++ b/services/training/README.md
@@ -116,29 +116,51 @@ async def broadcast_training_progress(job_id: str, progress: dict):
     await websocket_manager.broadcast(job_id, message)
 ```
 
-### Model Artifact Management
+### Model Artifact Management (MinIO Storage)
 
 ```python
-# Model storage and retrieval
+# Model storage and retrieval using MinIO
 import joblib
-from pathlib import Path
+from shared.clients.minio_client import minio_client
 
-# Save trained model
+# Save trained model to MinIO
 def save_model_artifact(model: Prophet, tenant_id: str, product_id: str) -> str:
-    """Serialize and store model"""
-    model_dir = Path(f"/models/{tenant_id}/{product_id}")
-    model_dir.mkdir(parents=True, exist_ok=True)
-    
+    """Serialize and store model in MinIO"""
+    import io
     version = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
-    model_path = model_dir / f"model_v{version}.pkl"
+    model_id = str(uuid.uuid4())
+    object_name = f"models/{tenant_id}/{product_id}/{model_id}.pkl"
-    joblib.dump(model, model_path)
-    return str(model_path)
+    # Serialize model (joblib.dump writes to file-like objects)
+    buffer = io.BytesIO()
+    joblib.dump(model, buffer)
+    model_data = buffer.getvalue()
-# Load trained model
+    # Upload to MinIO
+    minio_client.put_object(
+        bucket_name="training-models",
+        object_name=object_name,
+        data=model_data,
+        content_type="application/octet-stream"
+    )
+    
+    # Return MinIO path
+    return f"minio://training-models/{object_name}"
+
+# Load trained model from MinIO
 def load_model_artifact(model_path: str) -> Prophet:
-    """Load serialized model"""
-    return joblib.load(model_path)
+    """Load serialized model from MinIO"""
+    import io
+    # Parse MinIO path: minio://bucket_name/object_path
+    _, bucket_and_path = model_path.split("://", 1)
+    bucket_name, object_name = bucket_and_path.split("/", 1)
+    
+    # Download from MinIO
+    model_data = minio_client.get_object(bucket_name, object_name)
+    
+    # Deserialize (joblib.load reads from file-like objects)
+    buffer = io.BytesIO(model_data)
+    return joblib.load(buffer)
 ```
 
 ### Performance Metrics Calculation
@@ -194,8 +216,8 @@ def calculate_performance_metrics(model: Prophet, actual_data: pd.DataFrame) ->
 - **Framework**: FastAPI (Python 3.11+) - Async web framework with WebSocket support
 - **Database**: PostgreSQL 17 - Training logs, model metadata, job queue
 - **ML Library**: Prophet (fbprophet) - Time series forecasting
-- **Model Storage**: Joblib - Model serialization
-- **File System**: Persistent volumes - Model artifact storage
+- **Model Storage**: MinIO (S3-compatible) - Distributed object storage with TLS
+- **Serialization**: Joblib - Model serialization
 - **WebSocket**: FastAPI WebSocket - Real-time progress updates
 - **Messaging**: RabbitMQ 4.1 - Training completion events
 - **ORM**: SQLAlchemy 2.0 (async) - Database abstraction
@@ -442,7 +464,13 @@ websocket_messages_sent = Counter(
 - `PORT` - Service port (default: 8004)
 - `DATABASE_URL` - PostgreSQL connection string
 - `RABBITMQ_URL` - RabbitMQ connection string
-- `MODEL_STORAGE_PATH` - Path for model artifacts (default: /models)
+
+**MinIO Configuration:**
+- `MINIO_ENDPOINT` - MinIO server endpoint (default: minio.bakery-ia.svc.cluster.local:9000)
+- `MINIO_ACCESS_KEY` - MinIO access key
+- `MINIO_SECRET_KEY` - MinIO secret key
+- `MINIO_USE_SSL` - Enable TLS (default: true)
+- `MINIO_MODEL_BUCKET` - Bucket for models (default: training-models)
 
 **Training Configuration:**
 - `MAX_CONCURRENT_JOBS` - Maximum parallel training jobs (default: 3)
@@ -462,10 +490,9 @@ websocket_messages_sent = Counter(
 - `WEBSOCKET_MAX_CONNECTIONS` - Max connections per tenant (default: 10)
 - `WEBSOCKET_MESSAGE_QUEUE_SIZE` - Message buffer size (default: 100)
 
-**Storage Configuration:**
-- `MODEL_RETENTION_DAYS` - Days to keep old models (default: 90)
-- `MAX_MODEL_VERSIONS_PER_PRODUCT` - Version limit (default: 10)
-- `ENABLE_MODEL_COMPRESSION` - Compress model files (default: true)
+**Storage Configuration (MinIO):**
+- `MINIO_MODEL_LIFECYCLE_DAYS` - Days to keep old model versions (default: 90)
+- `MINIO_CACHE_TTL_SECONDS` - Model cache TTL in seconds (default: 3600)
 
 ## Development Setup
 
@@ -473,7 +500,7 @@ websocket_messages_sent = Counter(
 - Python 3.11+
 - PostgreSQL 17
 - RabbitMQ 4.1
-- Persistent storage for model artifacts
+- MinIO (S3-compatible object storage)
 
 ### Local 
Development ```bash @@ -488,10 +515,13 @@ pip install -r requirements.txt # Set environment variables export DATABASE_URL=postgresql://user:pass@localhost:5432/training export RABBITMQ_URL=amqp://guest:guest@localhost:5672/ -export MODEL_STORAGE_PATH=/tmp/models +export MINIO_ENDPOINT=localhost:9000 +export MINIO_ACCESS_KEY=minioadmin +export MINIO_SECRET_KEY=minioadmin +export MINIO_USE_SSL=false # Use true in production -# Create model storage directory -mkdir -p /tmp/models +# Start MinIO locally (if not using K8s) +docker run -p 9000:9000 -p 9001:9001 minio/minio server /data --console-address ":9001" # Run database migrations alembic upgrade head @@ -590,7 +620,7 @@ for feature_name in poi_features.keys(): - **External Service** - Fetch weather, traffic, holiday, and POI feature data - **PostgreSQL** - Store job queue, models, metrics, logs - **RabbitMQ** - Publish training completion events -- **File System** - Store model artifacts +- **MinIO** - Store model artifacts (S3-compatible object storage with TLS) ### Dependents (Services That Call This) - **Forecasting Service** - Load trained models for predictions @@ -627,11 +657,11 @@ for feature_name in poi_features.keys(): 4. **Resource Limits** - CPU/memory limits per training job 5. **Priority Queue** - Prioritize important products first -### Storage Optimization -1. **Model Compression** - Compress model artifacts (gzip) -2. **Old Model Cleanup** - Automatic deletion after retention period -3. **Version Limits** - Keep only N most recent versions -4. **Deduplication** - Avoid storing identical models +### Storage Optimization (MinIO) +1. **Object Versioning** - MinIO maintains version history automatically +2. **Lifecycle Policies** - Auto-cleanup old versions after 90 days +3. **TLS Encryption** - Secure communication with MinIO +4. **Distributed Storage** - MinIO handles replication and availability ### WebSocket Optimization 1. 
**Message Batching** - Batch progress updates (every 2 seconds) diff --git a/services/training/app/api/health.py b/services/training/app/api/health.py index 94d9652b..d7862259 100644 --- a/services/training/app/api/health.py +++ b/services/training/app/api/health.py @@ -96,48 +96,48 @@ def check_system_resources() -> Dict[str, Any]: def check_model_storage() -> Dict[str, Any]: - """Check model storage health""" + """Check MinIO model storage health""" try: - storage_path = settings.MODEL_STORAGE_PATH + from shared.clients.minio_client import minio_client - if not os.path.exists(storage_path): + # Check MinIO connectivity + if not minio_client.health_check(): return { - "status": "warning", - "message": f"Model storage path does not exist: {storage_path}" + "status": "unhealthy", + "message": "MinIO service is not reachable", + "storage_type": "minio" } - # Check if writable - test_file = os.path.join(storage_path, ".health_check") - try: - with open(test_file, 'w') as f: - f.write("test") - os.remove(test_file) - writable = True - except Exception: - writable = False + bucket_name = settings.MINIO_MODEL_BUCKET - # Count model files - model_files = 0 - total_size = 0 - for root, dirs, files in os.walk(storage_path): - for file in files: - if file.endswith('.pkl'): - model_files += 1 - file_path = os.path.join(root, file) - total_size += os.path.getsize(file_path) + # Check if bucket exists + bucket_exists = minio_client.bucket_exists(bucket_name) + if not bucket_exists: + return { + "status": "warning", + "message": f"MinIO bucket does not exist: {bucket_name}", + "storage_type": "minio" + } + + # Count model files in MinIO + model_objects = minio_client.list_objects(bucket_name, prefix="models/") + model_files = [obj for obj in model_objects if obj.endswith('.pkl')] return { - "status": "healthy" if writable else "degraded", - "path": storage_path, - "writable": writable, - "model_files": model_files, - "total_size_mb": round(total_size / 1024 / 1024, 2) + "status": "healthy", + "storage_type": "minio", + "endpoint": settings.MINIO_ENDPOINT, + "bucket": bucket_name, + "use_ssl": settings.MINIO_USE_SSL, + "model_files": len(model_files), + "bucket_exists": bucket_exists } except Exception as e: - logger.error(f"Model storage check failed: {e}") + logger.error(f"MinIO storage check failed: {e}") return { "status": "error", + "storage_type": "minio", "error": str(e) } diff --git a/services/training/app/api/models.py b/services/training/app/api/models.py index ae7c750f..a07e0378 100644 --- a/services/training/app/api/models.py +++ b/services/training/app/api/models.py @@ -14,7 +14,6 @@ from app.services.training_service import EnhancedTrainingService from datetime import datetime, timezone from sqlalchemy import select, delete, func import uuid -import shutil from shared.auth.decorators import ( get_current_user_dep, @@ -304,10 +303,9 @@ async def delete_tenant_models_complete( "jobs_cancelled": 0, "models_deleted": 0, "artifacts_deleted": 0, - "artifacts_files_deleted": 0, + "minio_objects_deleted": 0, "training_logs_deleted": 0, "performance_metrics_deleted": 0, - "storage_freed_bytes": 0, "errors": [] } @@ -336,51 +334,35 @@ async def delete_tenant_models_complete( deletion_stats["errors"].append(error_msg) logger.error(error_msg) - # Step 2: Delete model artifact files from storage + # Step 2: Delete model artifact files from MinIO storage try: - artifacts_query = select(ModelArtifact).where( - ModelArtifact.tenant_id == tenant_uuid - ) - artifacts_result = await 
db.execute(artifacts_query) - artifacts = artifacts_result.scalars().all() - - storage_freed = 0 + from shared.clients.minio_client import minio_client + + bucket_name = settings.MINIO_MODEL_BUCKET + prefix = f"models/{tenant_id}/" + + # List all objects for this tenant + objects_to_delete = minio_client.list_objects(bucket_name, prefix=prefix) + files_deleted = 0 - - for artifact in artifacts: + for obj_name in objects_to_delete: try: - file_path = Path(artifact.file_path) - if file_path.exists(): - file_size = file_path.stat().st_size - file_path.unlink() # Delete file - storage_freed += file_size - files_deleted += 1 - logger.debug("Deleted artifact file", - file_path=str(file_path), - size_bytes=file_size) - - # Also try to delete parent directories if empty - try: - if file_path.parent.exists() and not any(file_path.parent.iterdir()): - file_path.parent.rmdir() - except: - pass # Ignore errors cleaning up directories - + minio_client.delete_object(bucket_name, obj_name) + files_deleted += 1 + logger.debug("Deleted MinIO object", object_name=obj_name) except Exception as e: - error_msg = f"Error deleting artifact file {artifact.file_path}: {str(e)}" + error_msg = f"Error deleting MinIO object {obj_name}: {str(e)}" deletion_stats["errors"].append(error_msg) logger.warning(error_msg) - - deletion_stats["artifacts_files_deleted"] = files_deleted - deletion_stats["storage_freed_bytes"] = storage_freed - - logger.info("Deleted artifact files", + + deletion_stats["minio_objects_deleted"] = files_deleted + + logger.info("Deleted MinIO objects", tenant_id=tenant_id, - files_deleted=files_deleted, - storage_freed_mb=storage_freed / (1024 * 1024)) - + files_deleted=files_deleted) + except Exception as e: - error_msg = f"Error processing artifact files: {str(e)}" + error_msg = f"Error processing MinIO objects: {str(e)}" deletion_stats["errors"].append(error_msg) logger.error(error_msg) @@ -463,19 +445,7 @@ async def delete_tenant_models_complete( detail=error_msg ) - # Step 4: Clean up tenant model directory - try: - tenant_model_dir = Path(settings.MODEL_STORAGE_PATH) / tenant_id - if tenant_model_dir.exists(): - shutil.rmtree(tenant_model_dir) - logger.info("Deleted tenant model directory", - directory=str(tenant_model_dir)) - except Exception as e: - error_msg = f"Error deleting model directory: {str(e)}" - deletion_stats["errors"].append(error_msg) - logger.warning(error_msg) - - # Models deleted successfully + # Step 4: Models deleted successfully (MinIO cleanup already done in Step 2) return { "success": True, "message": f"All training data for tenant {tenant_id} deleted successfully", diff --git a/services/training/app/core/config.py b/services/training/app/core/config.py index 785fa351..6fc1d718 100644 --- a/services/training/app/core/config.py +++ b/services/training/app/core/config.py @@ -44,6 +44,18 @@ class TrainingSettings(BaseServiceSettings): MODEL_BACKUP_ENABLED: bool = os.getenv("MODEL_BACKUP_ENABLED", "true").lower() == "true" MODEL_VERSIONING_ENABLED: bool = os.getenv("MODEL_VERSIONING_ENABLED", "true").lower() == "true" + # MinIO Configuration + MINIO_ENDPOINT: str = os.getenv("MINIO_ENDPOINT", "minio.bakery-ia.svc.cluster.local:9000") + MINIO_ACCESS_KEY: str = os.getenv("MINIO_ACCESS_KEY", "training-service") + MINIO_SECRET_KEY: str = os.getenv("MINIO_SECRET_KEY", "training-secret-key") + MINIO_USE_SSL: bool = os.getenv("MINIO_USE_SSL", "true").lower() == "true" + MINIO_MODEL_BUCKET: str = os.getenv("MINIO_MODEL_BUCKET", "training-models") + MINIO_CONSOLE_PORT: str = 
os.getenv("MINIO_CONSOLE_PORT", "9001") + MINIO_API_PORT: str = os.getenv("MINIO_API_PORT", "9000") + MINIO_REGION: str = os.getenv("MINIO_REGION", "us-east-1") + MINIO_MODEL_LIFECYCLE_DAYS: int = int(os.getenv("MINIO_MODEL_LIFECYCLE_DAYS", "90")) + MINIO_CACHE_TTL_SECONDS: int = int(os.getenv("MINIO_CACHE_TTL_SECONDS", "3600")) + # Training Configuration MAX_CONCURRENT_TRAINING_JOBS: int = int(os.getenv("MAX_CONCURRENT_TRAINING_JOBS", "3")) diff --git a/services/training/app/ml/hybrid_trainer.py b/services/training/app/ml/hybrid_trainer.py index 7f1c068a..722e8486 100644 --- a/services/training/app/ml/hybrid_trainer.py +++ b/services/training/app/ml/hybrid_trainer.py @@ -5,6 +5,7 @@ Combines Prophet's seasonality modeling with XGBoost's pattern learning import pandas as pd import numpy as np +import io from typing import Dict, List, Any, Optional, Tuple import structlog from datetime import datetime, timezone @@ -110,8 +111,8 @@ class HybridProphetXGBoost: # Step 4: Get Prophet predictions on training data logger.info("Step 3: Generating Prophet predictions for residual calculation") - train_prophet_pred = self._get_prophet_predictions(prophet_result, train_df) - val_prophet_pred = self._get_prophet_predictions(prophet_result, val_df) + train_prophet_pred = await self._get_prophet_predictions(prophet_result, train_df) + val_prophet_pred = await self._get_prophet_predictions(prophet_result, val_df) # Step 5: Calculate residuals (actual - prophet_prediction) train_residuals = train_df['y'].values - train_prophet_pred @@ -207,7 +208,7 @@ class HybridProphetXGBoost: return df_enhanced - def _get_prophet_predictions( + async def _get_prophet_predictions( self, prophet_result: Dict[str, Any], df: pd.DataFrame @@ -230,8 +231,13 @@ class HybridProphetXGBoost: # Load the actual Prophet model from the stored path try: - import joblib - prophet_model = joblib.load(model_path) + if model_path.startswith("minio://"): + # Use prophet_manager to load from MinIO + prophet_model = await self.prophet_manager._load_model_from_minio(model_path) + else: + # Fallback to direct loading for local paths + import joblib + prophet_model = joblib.load(model_path) except Exception as e: raise ValueError(f"Failed to load Prophet model from path {model_path}: {str(e)}") @@ -417,8 +423,13 @@ class HybridProphetXGBoost: # Load the Prophet model from the stored path try: - import joblib - prophet_model = joblib.load(prophet_model_path) + if prophet_model_path.startswith("minio://"): + # Use prophet_manager to load from MinIO + prophet_model = await self.prophet_manager._load_model_from_minio(prophet_model_path) + else: + # Fallback to direct loading for local paths + import joblib + prophet_model = joblib.load(prophet_model_path) except Exception as e: raise ValueError(f"Failed to load Prophet model from path {prophet_model_path}: {str(e)}") diff --git a/services/training/app/ml/prophet_manager.py b/services/training/app/ml/prophet_manager.py index aa6dcbd2..1f2bae11 100644 --- a/services/training/app/ml/prophet_manager.py +++ b/services/training/app/ml/prophet_manager.py @@ -13,6 +13,7 @@ from datetime import datetime, timedelta import uuid import os import joblib +import io from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from sklearn.model_selection import TimeSeriesSplit import json @@ -85,9 +86,24 @@ class BakeryProphetManager: self.database_manager = database_manager or create_database_manager(settings.DATABASE_URL, "training-service") self.db_session = None # Will be set when session is 
available - # Ensure model storage directory exists - os.makedirs(settings.MODEL_STORAGE_PATH, exist_ok=True) - + # Initialize MinIO client and ensure bucket exists + from shared.clients.minio_client import minio_client + self.minio_client = minio_client + self._ensure_minio_bucket() + + def _ensure_minio_bucket(self): + """Ensure the training-models bucket exists in MinIO""" + try: + bucket_name = settings.MINIO_MODEL_BUCKET + if not self.minio_client.bucket_exists(bucket_name): + self.minio_client.create_bucket(bucket_name) + logger.info(f"Created MinIO bucket: {bucket_name}") + else: + logger.debug(f"MinIO bucket already exists: {bucket_name}") + except Exception as e: + logger.error(f"Failed to ensure MinIO bucket exists: {e}") + # Don't raise - bucket might be created by init job + async def train_bakery_model(self, tenant_id: str, inventory_product_id: str, @@ -706,18 +722,40 @@ class BakeryProphetManager: session = None) -> str: """Store model with database integration""" - # Create model directory - model_dir = Path(settings.MODEL_STORAGE_PATH) / tenant_id - model_dir.mkdir(parents=True, exist_ok=True) + # Store model in MinIO (clean implementation - MinIO only) + # Use BytesIO buffer since joblib.dump() writes to file-like objects + buffer = io.BytesIO() + joblib.dump(model, buffer) + model_data = buffer.getvalue() + object_name = f"models/{tenant_id}/{inventory_product_id}/{model_id}.pkl" + + # Use MinIO client + from shared.clients.minio_client import minio_client + + # Upload model to MinIO + success = minio_client.put_object( + bucket_name="training-models", + object_name=object_name, + data=model_data, + content_type="application/octet-stream", + metadata={ + "model_id": model_id, + "tenant_id": tenant_id, + "inventory_product_id": inventory_product_id, + "model_type": "prophet_optimized" + } + ) + + if not success: + raise Exception("Failed to upload model to MinIO") + + # Return MinIO object path + model_path = f"minio://training-models/{object_name}" + + # Calculate checksum for model data + import hashlib + model_checksum = hashlib.sha256(model_data).hexdigest() - # Store model file - model_path = model_dir / f"{model_id}.pkl" - joblib.dump(model, model_path) - - # Calculate checksum for model file integrity - checksummed_file = ChecksummedFile(str(model_path)) - model_checksum = checksummed_file.calculate_and_save_checksum() - # Enhanced metadata with checksum metadata = { "model_id": model_id, @@ -733,14 +771,23 @@ class BakeryProphetManager: "optimized_parameters": optimized_params or {}, "created_at": datetime.now().isoformat(), "model_type": "prophet_optimized", - "file_path": str(model_path), + "minio_path": model_path, "checksum": model_checksum, "checksum_algorithm": "sha256" } + + # Store metadata in MinIO as well + metadata_json = json.dumps(metadata, indent=2, default=str) + metadata_object_name = f"models/{tenant_id}/{inventory_product_id}/{model_id}.json" + minio_client.put_object( + bucket_name="training-models", + object_name=metadata_object_name, + data=metadata_json, + content_type="application/json" + ) - metadata_path = model_path.with_suffix('.json') - with open(metadata_path, 'w') as f: - json.dump(metadata, f, indent=2, default=str) + # Define metadata_path for database record + metadata_path = f"minio://training-models/{metadata_object_name}" # Store in memory model_key = f"{tenant_id}:{inventory_product_id}" @@ -854,16 +901,10 @@ class BakeryProphetManager: model_path: str, future_dates: pd.DataFrame, regressor_columns: List[str]) -> pd.DataFrame: 
- """Generate forecast using stored model with checksum verification""" + """Generate forecast using stored model from MinIO""" try: - # Verify model file integrity before loading - checksummed_file = ChecksummedFile(model_path) - if not checksummed_file.load_and_verify_checksum(): - logger.warning(f"Checksum verification failed for model: {model_path}") - # Still load the model but log warning - # In production, you might want to raise an exception instead - - model = joblib.load(model_path) + # Load model from MinIO + model = await self._load_model_from_minio(model_path) for regressor in regressor_columns: if regressor not in future_dates.columns: @@ -876,6 +917,33 @@ class BakeryProphetManager: except Exception as e: logger.error(f"Failed to generate forecast: {str(e)}") raise + + async def _load_model_from_minio(self, model_path: str): + """Load model from MinIO storage""" + try: + # Parse MinIO path: minio://bucket_name/object_path + if not model_path.startswith("minio://"): + raise ValueError(f"Invalid MinIO path: {model_path}") + + _, bucket_and_path = model_path.split("://", 1) + bucket_name, object_name = bucket_and_path.split("/", 1) + + logger.debug(f"Loading model from MinIO: {bucket_name}/{object_name}") + + # Download model data from MinIO + model_data = self.minio_client.get_object(bucket_name, object_name) + if not model_data: + raise ValueError(f"Failed to download model from MinIO: {model_path}") + + # Deserialize model (using BytesIO since joblib.load reads from file-like objects) + buffer = io.BytesIO(model_data) + model = joblib.load(buffer) + logger.info(f"Model loaded successfully from MinIO: {model_path}") + return model + + except Exception as e: + logger.error(f"Failed to load model from MinIO: {model_path}, error: {e}") + raise async def _validate_training_data(self, df: pd.DataFrame, inventory_product_id: str): """Validate training data quality (unchanged)""" diff --git a/services/training/requirements.txt b/services/training/requirements.txt index 3dd35d54..b0d0c677 100644 --- a/services/training/requirements.txt +++ b/services/training/requirements.txt @@ -17,6 +17,7 @@ scikit-learn==1.6.1 pandas==2.2.3 numpy==2.2.2 joblib==1.4.2 +minio==7.2.2 xgboost==2.1.3 # HTTP client diff --git a/shared/clients/minio_client.py b/shared/clients/minio_client.py new file mode 100644 index 00000000..911509fa --- /dev/null +++ b/shared/clients/minio_client.py @@ -0,0 +1,418 @@ +""" +MinIO Client Library +Shared client for MinIO object storage operations with TLS support +""" + +import os +import io +import ssl +import time +import urllib3 +from typing import Optional, Dict, Any, Union +from pathlib import Path +from functools import wraps + +from minio import Minio +from minio.error import S3Error +import structlog + +# Configure logger +logger = structlog.get_logger() + + +def with_retry(max_retries: int = 3, base_delay: float = 1.0, max_delay: float = 30.0): + """Decorator for retrying operations with exponential backoff + + Args: + max_retries: Maximum number of retry attempts + base_delay: Initial delay between retries in seconds + max_delay: Maximum delay between retries in seconds + """ + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + last_exception = None + for attempt in range(max_retries + 1): + try: + return func(*args, **kwargs) + except (S3Error, urllib3.exceptions.HTTPError, ConnectionError, TimeoutError) as e: + last_exception = e + if attempt < max_retries: + # Exponential backoff with jitter + delay = min(base_delay * (2 ** attempt), 
max_delay) + logger.warning( + f"MinIO operation failed, retrying in {delay:.1f}s", + attempt=attempt + 1, + max_retries=max_retries, + error=str(e) + ) + time.sleep(delay) + else: + logger.error( + "MinIO operation failed after all retries", + attempts=max_retries + 1, + error=str(e) + ) + raise last_exception + return wrapper + return decorator + + +class MinIOClient: + """Client for MinIO object storage operations with TLS support""" + + def __init__(self): + """Initialize MinIO client with configuration""" + self._client = None + self._initialize_client() + + def _initialize_client(self) -> None: + """Initialize MinIO client from environment variables with SSL/TLS support""" + try: + # Get configuration from environment + endpoint = os.getenv("MINIO_ENDPOINT", "minio.bakery-ia.svc.cluster.local:9000") + access_key = os.getenv("MINIO_ACCESS_KEY", os.getenv("MINIO_ROOT_USER", "admin")) + secret_key = os.getenv("MINIO_SECRET_KEY", os.getenv("MINIO_ROOT_PASSWORD", "secure-password")) + use_ssl = os.getenv("MINIO_USE_SSL", "true").lower() == "true" + + # TLS certificate paths (optional - for cert verification) + ca_cert_path = os.getenv("MINIO_CA_CERT_PATH", "/etc/ssl/certs/minio-ca.crt") + # SSL verification is disabled by default for internal cluster with self-signed certs + # Set MINIO_VERIFY_SSL=true and provide CA cert path for production with proper certs + verify_ssl = os.getenv("MINIO_VERIFY_SSL", "false").lower() == "true" + + # Try to get settings from service configuration if available + try: + from app.core.config import settings + if hasattr(settings, 'MINIO_ENDPOINT'): + endpoint = settings.MINIO_ENDPOINT + access_key = settings.MINIO_ACCESS_KEY + secret_key = settings.MINIO_SECRET_KEY + use_ssl = settings.MINIO_USE_SSL + except ImportError: + # Fallback to environment variables (for shared client usage) + pass + + # Configure HTTP client with TLS settings + http_client = None + if use_ssl: + # Create custom HTTP client for TLS + if verify_ssl and os.path.exists(ca_cert_path): + # Verify certificates against CA + http_client = urllib3.PoolManager( + timeout=urllib3.Timeout(connect=10.0, read=60.0), + maxsize=10, + cert_reqs='CERT_REQUIRED', + ca_certs=ca_cert_path, + retries=urllib3.Retry( + total=5, + backoff_factor=0.2, + status_forcelist=[500, 502, 503, 504] + ) + ) + logger.info("MinIO TLS with certificate verification enabled", + ca_cert_path=ca_cert_path) + else: + # TLS without certificate verification (for self-signed certs in internal cluster) + # Still encrypted, just skips cert validation + http_client = urllib3.PoolManager( + timeout=urllib3.Timeout(connect=10.0, read=60.0), + maxsize=10, + cert_reqs='CERT_NONE', + retries=urllib3.Retry( + total=5, + backoff_factor=0.2, + status_forcelist=[500, 502, 503, 504] + ) + ) + # Suppress insecure request warnings for internal cluster + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + logger.info("MinIO TLS enabled without certificate verification (internal cluster)") + + # Initialize client with SSL/TLS + self._client = Minio( + endpoint, + access_key=access_key, + secret_key=secret_key, + secure=use_ssl, + http_client=http_client + ) + + logger.info("MinIO client initialized successfully", + endpoint=endpoint, + use_ssl=use_ssl, + verify_ssl=verify_ssl if use_ssl else False) + + except Exception as e: + logger.error("Failed to initialize MinIO client", error=str(e)) + raise + + def reconnect(self) -> bool: + """Reconnect to MinIO server + + Useful when connection is lost or credentials have changed. 
+ + Returns: + True if reconnection succeeded, False otherwise + """ + try: + logger.info("Attempting to reconnect to MinIO...") + self._initialize_client() + return True + except Exception as e: + logger.error("Failed to reconnect to MinIO", error=str(e)) + return False + + @with_retry(max_retries=3, base_delay=1.0) + def bucket_exists(self, bucket_name: str) -> bool: + """Check if bucket exists - handles limited permissions gracefully""" + try: + # First try the standard method + return self._client.bucket_exists(bucket_name) + except S3Error as e: + # If we get AccessDenied, try alternative method for limited-permission users + if e.code == "AccessDenied": + logger.debug("Access denied for bucket_exists, trying alternative method", + bucket_name=bucket_name) + try: + # Try to list objects - this works with ListBucket permission + # If bucket doesn't exist, this will raise NoSuchBucket error + # If bucket exists but user has no permission, this will raise AccessDenied + objects = list(self._client.list_objects(bucket_name, recursive=False)) + logger.debug("Bucket exists (verified via list_objects)", bucket_name=bucket_name) + return True + except S3Error as list_error: + if list_error.code == "NoSuchBucket": + logger.debug("Bucket does not exist", bucket_name=bucket_name) + return False + else: + logger.error("Failed to check bucket existence (alternative method)", + bucket_name=bucket_name, + error=str(list_error)) + return False + else: + logger.error("Failed to check bucket existence", + bucket_name=bucket_name, + error=str(e)) + return False + + def create_bucket(self, bucket_name: str, region: str = "us-east-1") -> bool: + """Create a new bucket if it doesn't exist""" + try: + if not self.bucket_exists(bucket_name): + self._client.make_bucket(bucket_name, region) + logger.info("Created MinIO bucket", bucket_name=bucket_name) + return True + return False + except S3Error as e: + logger.error("Failed to create bucket", + bucket_name=bucket_name, + error=str(e)) + return False + + @with_retry(max_retries=3, base_delay=1.0) + def put_object( + self, + bucket_name: str, + object_name: str, + data: Union[bytes, io.BytesIO, str, Path], + length: Optional[int] = None, + content_type: str = "application/octet-stream", + metadata: Optional[Dict[str, str]] = None + ) -> bool: + """Upload an object to MinIO + + Args: + bucket_name: Target bucket name + object_name: Object key/path in the bucket + data: Data to upload (bytes, BytesIO, string, or Path) + length: Optional data length (calculated automatically if not provided) + content_type: MIME type of the object + metadata: Optional metadata dictionary + + Returns: + True if upload succeeded, False otherwise + """ + try: + # Ensure bucket exists + self.create_bucket(bucket_name) + + # Convert data to bytes if needed + if isinstance(data, str): + data = data.encode('utf-8') + elif isinstance(data, Path): + with open(data, 'rb') as f: + data = f.read() + elif isinstance(data, io.BytesIO): + data = data.getvalue() + + # Calculate length if not provided + data_length = length if length is not None else len(data) + + # MinIO SDK requires BytesIO stream and explicit length + data_stream = io.BytesIO(data) + + # Upload object with proper stream and length + self._client.put_object( + bucket_name, + object_name, + data_stream, + length=data_length, + content_type=content_type, + metadata=metadata + ) + + logger.info("Uploaded object to MinIO", + bucket_name=bucket_name, + object_name=object_name, + size=data_length) + + return True + + except S3Error 
as e:
+            logger.error("Failed to upload object",
+                        bucket_name=bucket_name,
+                        object_name=object_name,
+                        error=str(e))
+            return False
+
+    @with_retry(max_retries=3, base_delay=1.0)
+    def get_object(self, bucket_name: str, object_name: str) -> Optional[bytes]:
+        """Download an object from MinIO"""
+        try:
+            # Get object data and always release the connection back to the pool
+            response = self._client.get_object(bucket_name, object_name)
+            try:
+                data = response.read()
+            finally:
+                response.close()
+                response.release_conn()
+
+            logger.info("Downloaded object from MinIO",
+                       bucket_name=bucket_name,
+                       object_name=object_name,
+                       size=len(data))
+
+            return data
+
+        except S3Error as e:
+            logger.error("Failed to download object",
+                        bucket_name=bucket_name,
+                        object_name=object_name,
+                        error=str(e))
+            return None
+
+    def object_exists(self, bucket_name: str, object_name: str) -> bool:
+        """Check if object exists"""
+        try:
+            self._client.stat_object(bucket_name, object_name)
+            return True
+        except S3Error:
+            return False
+
+    def list_objects(self, bucket_name: str, prefix: str = "") -> list:
+        """List objects in bucket with optional prefix"""
+        try:
+            objects = self._client.list_objects(bucket_name, prefix=prefix, recursive=True)
+            return [obj.object_name for obj in objects]
+        except S3Error as e:
+            logger.error("Failed to list objects",
+                        bucket_name=bucket_name,
+                        prefix=prefix,
+                        error=str(e))
+            return []
+
+    def delete_object(self, bucket_name: str, object_name: str) -> bool:
+        """Delete an object from MinIO"""
+        try:
+            self._client.remove_object(bucket_name, object_name)
+            logger.info("Deleted object from MinIO",
+                       bucket_name=bucket_name,
+                       object_name=object_name)
+            return True
+        except S3Error as e:
+            logger.error("Failed to delete object",
+                        bucket_name=bucket_name,
+                        object_name=object_name,
+                        error=str(e))
+            return False
+
+    def get_presigned_url(
+        self,
+        bucket_name: str,
+        object_name: str,
+        expires: int = 3600
+    ) -> Optional[str]:
+        """Generate presigned URL for object access"""
+        try:
+            # presigned_get_object() expects a timedelta, so convert the seconds value
+            from datetime import timedelta
+            url = self._client.presigned_get_object(
+                bucket_name,
+                object_name,
+                expires=timedelta(seconds=expires)
+            )
+            return url
+        except S3Error as e:
+            logger.error("Failed to generate presigned URL",
+                        bucket_name=bucket_name,
+                        object_name=object_name,
+                        error=str(e))
+            return None
+
+    def copy_object(
+        self,
+        source_bucket: str,
+        source_object: str,
+        dest_bucket: str,
+        dest_object: str
+    ) -> bool:
+        """Copy object within MinIO"""
+        try:
+            # Ensure destination bucket exists
+            self.create_bucket(dest_bucket)
+
+            # Copy object (the MinIO SDK takes a CopySource, not a "bucket/object" string)
+            from minio.commonconfig import CopySource
+            self._client.copy_object(
+                dest_bucket,
+                dest_object,
+                CopySource(source_bucket, source_object)
+            )
+
+            logger.info("Copied object in MinIO",
+                       source_bucket=source_bucket,
+                       source_object=source_object,
+                       dest_bucket=dest_bucket,
+                       dest_object=dest_object)
+
+            return True
+        except S3Error as e:
+            logger.error("Failed to copy object",
+                        source_bucket=source_bucket,
+                        source_object=source_object,
+                        dest_bucket=dest_bucket,
+                        dest_object=dest_object,
+                        error=str(e))
+            return False
+
+    def get_object_metadata(self, bucket_name: str, object_name: str) -> Optional[Dict[str, Any]]:
+        """Get object metadata"""
+        try:
+            stat = self._client.stat_object(bucket_name, object_name)
+            return {
+                "size": stat.size,
+                "last_modified": stat.last_modified,
+                "content_type": stat.content_type,
+                "metadata": stat.metadata or {}
+            }
+        except S3Error as e:
+            logger.error("Failed to get object metadata",
+                        bucket_name=bucket_name,
+                        object_name=object_name,
+                        error=str(e))
+            return None
+
+    def health_check(self) -> bool:
+        """Check MinIO service health"""
+        try:
+            # Simple bucket list to check connectivity
+            self._client.list_buckets()
+            return True
+        except Exception as e:
+            logger.error("MinIO health check failed", error=str(e))
+            return False
+
+
+# Singleton instance for convenience
+minio_client = MinIOClient()
\ No newline at end of file
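The shared client above is what the training and forecasting services import as a singleton. A short usage sketch from any service (bucket and object names are illustrative; behaviour is as defined by the methods above):

```python
from shared.clients.minio_client import minio_client

# Verify connectivity before doing any work (mirrors the health endpoint check)
if not minio_client.health_check():
    raise RuntimeError("MinIO is not reachable")

# Upload an artifact with metadata, then hand out a temporary download link
minio_client.put_object(
    bucket_name="training-models",
    object_name="models/demo-tenant/demo-product/example.pkl",
    data=b"example-bytes",
    content_type="application/octet-stream",
    metadata={"model_type": "example"},
)
url = minio_client.get_presigned_url(
    "training-models",
    "models/demo-tenant/demo-product/example.pkl",
    expires=900,  # seconds; converted to a timedelta inside the client
)
print(url)
```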
diff --git a/shared/config/base.py b/shared/config/base.py
index b0f1c83f..0a7e1bc1 100755
--- a/shared/config/base.py
+++ b/shared/config/base.py
@@ -315,10 +315,9 @@ class BaseServiceSettings(BaseSettings):
     # ================================================================
     # ML & AI CONFIGURATION
     # ================================================================
-    
-    # Model Storage
-    MODEL_STORAGE_PATH: str = os.getenv("MODEL_STORAGE_PATH", "/app/models")
-    MODEL_STORAGE_BACKEND: str = os.getenv("MODEL_STORAGE_BACKEND", "local")  # local, s3, gcs
+    
+    # Model Storage Backend (MinIO is the primary storage)
+    MODEL_STORAGE_BACKEND: str = os.getenv("MODEL_STORAGE_BACKEND", "minio")
 
     # Training Configuration
     MAX_TRAINING_TIME_MINUTES: int = int(os.getenv("MAX_TRAINING_TIME_MINUTES", "30"))
diff --git a/shared/monitoring/metrics.py b/shared/monitoring/metrics.py
index adffa1fe..74472ed5 100755
--- a/shared/monitoring/metrics.py
+++ b/shared/monitoring/metrics.py
@@ -308,6 +308,47 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
     return metrics_collector
 
 
+def track_user_activity(user_id: str, action: str, service_name: str = "unknown-service", metadata: dict = None):
+    """Track user activity metrics using the appropriate metrics collector"""
+    if metadata is None:
+        metadata = {}
+    
+    # Add user-specific attributes
+    attributes = {
+        "user.id": user_id,
+        "action": action,
+        **metadata
+    }
+    
+    # Get the metrics collector for the specified service
+    metrics_collector = get_metrics_collector(service_name)
+    
+    if metrics_collector:
+        # Use the collector's counter registration system
+        counter_name = "user_activity_total"
+        
+        # Check if counter already exists, if not register it
+        if counter_name not in metrics_collector._counters:
+            metrics_collector.register_counter(
+                name=counter_name,
+                documentation="Total user activity events"
+            )
+        
+        # Increment the counter with attributes
+        metrics_collector.increment_counter(counter_name, value=1, labels=attributes)
+    else:
+        # Fallback: create a temporary counter if no collector exists
+        from opentelemetry import metrics
+        
+        meter = metrics.get_meter(__name__)
+        user_activity_counter = meter.create_counter(
+            name="user_activity_total",
+            description="User activity events",
+            unit="events"
+        )
+        user_activity_counter.add(1, attributes)
+
+
 def setup_metrics_early(
     app,
     service_name: str = None,