Improve monitoring 2

Urtzi Alfaro
2026-01-09 07:26:11 +01:00
parent 4af860c010
commit 8ca5d9c100
39 changed files with 1035 additions and 376 deletions

View File

@@ -402,7 +402,7 @@ local_resource(
echo "" echo ""
echo "📈 SigNoz Access Information:" echo "📈 SigNoz Access Information:"
echo " URL: https://monitoring.bakery-ia.local/signoz" echo " URL: https://monitoring.bakery-ia.local"
echo " Username: admin" echo " Username: admin"
echo " Password: admin" echo " Password: admin"
echo "" echo ""
@@ -445,7 +445,7 @@ local_resource(
if [ "$READY_PODS" -eq "$TOTAL_PODS" ]; then if [ "$READY_PODS" -eq "$TOTAL_PODS" ]; then
echo "✅ All SigNoz pods are running!" echo "✅ All SigNoz pods are running!"
echo "" echo ""
echo "Access SigNoz at: https://monitoring.bakery-ia.local/signoz" echo "Access SigNoz at: https://monitoring.bakery-ia.local"
echo "Credentials: admin / admin" echo "Credentials: admin / admin"
else else
echo "⏳ Waiting for pods to become ready..." echo "⏳ Waiting for pods to become ready..."
@@ -687,7 +687,7 @@ Access your application:
SigNoz (Unified Observability): SigNoz (Unified Observability):
Deploy via Tilt: Trigger 'signoz-deployment' resource Deploy via Tilt: Trigger 'signoz-deployment' resource
Manual deploy: ./infrastructure/helm/deploy-signoz.sh dev Manual deploy: ./infrastructure/helm/deploy-signoz.sh dev
Access (if deployed): https://monitoring.bakery-ia.local/signoz Access (if deployed): https://monitoring.bakery-ia.local
Username: admin Username: admin
Password: admin Password: admin

View File

@@ -162,7 +162,7 @@ data:
exporters: exporters:
# Send to SigNoz # Send to SigNoz
otlphttp: otlphttp:
endpoint: http://signoz-otel-collector.signoz.svc.cluster.local:4318 endpoint: http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318
tls: tls:
insecure: true insecure: true
@@ -374,7 +374,7 @@ processors:
exporters: exporters:
otlphttp/logs: otlphttp/logs:
endpoint: http://signoz-otel-collector.signoz.svc.cluster.local:4318/v1/logs endpoint: http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/logs
service: service:
pipelines: pipelines:

View File

@@ -316,8 +316,8 @@ spec:
#### Issue: No Metrics Appearing in SigNoz #### Issue: No Metrics Appearing in SigNoz
**Checklist:** **Checklist:**
- ✅ OpenTelemetry Collector running? `kubectl get pods -n signoz` - ✅ OpenTelemetry Collector running? `kubectl get pods -n bakery-ia -l app.kubernetes.io/instance=signoz`
- ✅ Service can reach collector? `telnet signoz-otel-collector.signoz 4318` - ✅ Service can reach collector? `telnet signoz-otel-collector.bakery-ia 4318`
- ✅ OTLP endpoint configured correctly? Check `OTEL_EXPORTER_OTLP_ENDPOINT` - ✅ OTLP endpoint configured correctly? Check `OTEL_EXPORTER_OTLP_ENDPOINT`
- ✅ Service logs show OTLP export? Look for "Exporting metrics" - ✅ Service logs show OTLP export? Look for "Exporting metrics"
- ✅ No network policies blocking? Check Kubernetes network policies - ✅ No network policies blocking? Check Kubernetes network policies
@@ -325,13 +325,13 @@ spec:
**Debugging:** **Debugging:**
```bash ```bash
# Check OpenTelemetry Collector logs # Check OpenTelemetry Collector logs
kubectl logs -n signoz -l app=otel-collector kubectl logs -n bakery-ia -l app=otel-collector
# Check service logs for OTLP errors # Check service logs for OTLP errors
kubectl logs -l app=auth-service | grep -i otel kubectl logs -l app=auth-service | grep -i otel
# Test OTLP connectivity from service pod # Test OTLP connectivity from service pod
kubectl exec -it auth-service-pod -- curl -v http://signoz-otel-collector.signoz:4318 kubectl exec -it auth-service-pod -- curl -v http://signoz-otel-collector.bakery-ia:4318
``` ```
#### Issue: High Latency in Specific Service #### Issue: High Latency in Specific Service
@@ -442,7 +442,7 @@ class MyService(StandardFastAPIService):
```env ```env
# OpenTelemetry Collector endpoint # OpenTelemetry Collector endpoint
OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz-otel-collector.signoz:4318 OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz-otel-collector.bakery-ia:4318
# Service-specific configuration # Service-specific configuration
OTEL_SERVICE_NAME=auth-service OTEL_SERVICE_NAME=auth-service
@@ -473,7 +473,7 @@ spec:
image: auth-service:latest image: auth-service:latest
env: env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz:4318" value: "http://signoz-otel-collector.bakery-ia:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "auth-service" value: "auth-service"
- name: ENVIRONMENT - name: ENVIRONMENT

View File

@@ -48,7 +48,7 @@ def setup_tracing(service_name: str = "gateway"):
# Configure OTLP exporter (sends to OpenTelemetry Collector) # Configure OTLP exporter (sends to OpenTelemetry Collector)
otlp_exporter = OTLPSpanExporter( otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True # Use insecure connection for internal cluster communication insecure=True # Use insecure connection for internal cluster communication
) )

View File

@@ -0,0 +1,554 @@
# SigNoz Helm Deployment for Bakery IA
This directory contains the Helm configurations and deployment scripts for the SigNoz observability platform.
## Overview
SigNoz is deployed using the official Helm chart with environment-specific configurations optimized for:
- **Development**: Colima + Kind (Kubernetes in Docker) with Tilt
- **Production**: VPS on clouding.io with MicroK8s
## Prerequisites
### Required Tools
- **kubectl** 1.22+
- **Helm** 3.8+
- **Docker** (for development)
- **Kind/MicroK8s** (environment-specific)
### Docker Hub Authentication
SigNoz uses images from Docker Hub. Set up authentication to avoid rate limits:
```bash
# Option 1: Environment variables (recommended)
export DOCKERHUB_USERNAME='your-username'
export DOCKERHUB_PASSWORD='your-personal-access-token'
# Option 2: Docker login
docker login
```
## Quick Start
### Development Deployment
```bash
# Deploy SigNoz to development environment
./deploy-signoz.sh dev
# Verify deployment
./verify-signoz.sh dev
# Access SigNoz UI
# Via ingress: http://monitoring.bakery-ia.local
# Or port-forward:
kubectl port-forward -n bakery-ia svc/signoz 8080:8080
# Then open: http://localhost:8080
```
### Production Deployment
```bash
# Deploy SigNoz to production environment
./deploy-signoz.sh prod
# Verify deployment
./verify-signoz.sh prod
# Access SigNoz UI
# https://monitoring.bakewise.ai
```
## Configuration Files
### signoz-values-dev.yaml
Development environment configuration with:
- Single replica for most components
- Reduced resource requests (optimized for local Kind cluster)
- 7-day data retention
- Batch size: 10,000 events
- ClickHouse 25.5.6, OTel Collector v0.129.12
- PostgreSQL, Redis, and RabbitMQ receivers configured
### signoz-values-prod.yaml
Production environment configuration with:
- High availability: 2+ replicas for critical components
- 3 Zookeeper replicas (required for production)
- 30-day data retention
- Batch size: 50,000 events (high-performance)
- Cold storage enabled with 30-day TTL
- Horizontal Pod Autoscaler (HPA) enabled
- TLS/SSL with cert-manager
- Enhanced security with pod anti-affinity rules
## Key Configuration Changes (v0.89.0+)
⚠️ **BREAKING CHANGE**: SigNoz Helm chart v0.89.0+ uses a unified component structure.
**Old Structure (deprecated):**
```yaml
frontend:
replicaCount: 2
queryService:
replicaCount: 2
```
**New Structure (current):**
```yaml
signoz:
replicaCount: 2
# Combines frontend + query service
```
## Component Architecture
### Core Components
1. **SigNoz** (unified component)
- Frontend UI + Query Service
- Port 8080 (HTTP/API), 8085 (internal gRPC)
- Dev: 1 replica, Prod: 2+ replicas with HPA
2. **ClickHouse** (Time-series database)
- Version: 25.5.6
- Stores traces, metrics, and logs
- Dev: 1 replica, Prod: 2 replicas with cold storage
3. **Zookeeper** (ClickHouse coordination)
- Version: 3.7.1
- Dev: 1 replica, Prod: 3 replicas (critical for HA)
4. **OpenTelemetry Collector** (Data ingestion)
- Version: v0.129.12
- Ports: 4317 (gRPC), 4318 (HTTP), 8888 (metrics)
- Dev: 1 replica, Prod: 2+ replicas with HPA
5. **Alertmanager** (Alert management)
- Version: 0.23.5
- Email and Slack integrations configured
- Port: 9093
## Performance Optimizations
### Batch Processing
- **Development**: 10,000 events per batch
- **Production**: 50,000 events per batch (official recommendation)
- Timeout: 1 second for faster processing
### Memory Management
- Memory limiter processor prevents OOM
- Dev: 400 MiB limit, Prod: 1500 MiB limit
- Spike limits configured
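A condensed sketch of how these processors are set in the `otelCollector` section of the values files; development figures are shown and comments note the production equivalents (see `signoz-values-{env}.yaml` for the authoritative configuration):
```yaml
otelCollector:
  config:
    processors:
      # Batch for throughput: large batches, short flush interval
      batch:
        timeout: 1s
        send_batch_size: 10000       # 50000 in production
        send_batch_max_size: 10000   # 50000 in production
      # Cap collector memory to avoid OOM kills
      memory_limiter:
        check_interval: 1s
        limit_mib: 400               # 1500 in production
        spike_limit_mib: 100         # illustrative; production uses 300
```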
### Span Metrics Processor
Automatically generates RED metrics (Rate, Errors, Duration):
- Latency histogram buckets optimized for microservices
- Cache size: 10K (dev), 100K (prod)
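In the values files this shows up as the `spanmetrics` processor wired into the traces pipeline; an abbreviated sketch (the full histogram bucket list runs from 2ms to 15s):
```yaml
otelCollector:
  config:
    processors:
      # Derive rate/error/duration metrics from incoming spans
      spanmetrics:
        metrics_exporter: signozclickhousemetrics
        latency_histogram_buckets: [2ms, 10ms, 100ms, 400ms, 1s, 5s, 15s]  # abbreviated
        dimensions_cache_size: 10000   # 100000 in production
    service:
      pipelines:
        traces:
          processors: [memory_limiter, batch, spanmetrics, resourcedetection]
```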
### Cold Storage (Production Only)
- Enabled with 30-day TTL
- Automatically moves old data to cold storage
- Keeps 10GB free on primary storage
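Cold storage is configured under the ClickHouse section of the production values; a sketch matching `signoz-values-prod.yaml` (check that file for the exact nesting):
```yaml
clickhouse:
  coldStorage:
    enabled: true
    defaultKeepFreeSpaceBytes: 10737418240   # keep ~10 GB free on primary storage
    ttl:
      deleteTTLDays: 30                      # move data older than 30 days to cold storage
```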
## OpenTelemetry Endpoints
### From Within Kubernetes Cluster
**Development:**
```
OTLP gRPC: signoz-otel-collector.bakery-ia.svc.cluster.local:4317
OTLP HTTP: signoz-otel-collector.bakery-ia.svc.cluster.local:4318
```
**Production:**
```
OTLP gRPC: signoz-otel-collector.bakery-ia.svc.cluster.local:4317
OTLP HTTP: signoz-otel-collector.bakery-ia.svc.cluster.local:4318
```
### Application Configuration Example
```yaml
# Python with OpenTelemetry
OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
OTEL_EXPORTER_OTLP_PROTOCOL: "http/protobuf"
```
```javascript
// Node.js with OpenTelemetry
const exporter = new OTLPTraceExporter({
url: 'http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/traces',
});
```
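The same endpoint is also injected directly into each service's Kubernetes Deployment in this repository; a trimmed excerpt of that pattern (auth-service shown):
```yaml
# Deployment excerpt - pointing a service at the SigNoz collector
spec:
  template:
    spec:
      containers:
        - name: auth-service
          image: auth-service:latest
          env:
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
            - name: OTEL_SERVICE_NAME
              value: "auth-service"
```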
## Deployment Scripts
### deploy-signoz.sh
Comprehensive deployment script for installing, upgrading, and removing SigNoz:
```bash
# Usage
./deploy-signoz.sh [OPTIONS] ENVIRONMENT
# Options
-h, --help Show help message
-d, --dry-run Show what would be deployed
-u, --upgrade Upgrade existing deployment
-r, --remove Remove deployment
-n, --namespace NS Custom namespace (default: bakery-ia)
# Examples
./deploy-signoz.sh dev # Deploy to dev
./deploy-signoz.sh --upgrade prod # Upgrade prod
./deploy-signoz.sh --dry-run prod # Preview changes
./deploy-signoz.sh --remove dev # Remove dev deployment
```
**Features:**
- Automatic Helm repository setup
- Docker Hub secret creation
- Namespace management
- Deployment verification
- 15-minute timeout with `--wait` flag
### verify-signoz.sh
Verification script to check deployment health:
```bash
# Usage
./verify-signoz.sh [OPTIONS] ENVIRONMENT
# Examples
./verify-signoz.sh dev # Verify dev deployment
./verify-signoz.sh prod # Verify prod deployment
```
**Checks performed:**
1. ✅ Helm release status
2. ✅ Pod health and readiness
3. ✅ Service availability
4. ✅ Ingress configuration
5. ✅ PVC status
6. ✅ Resource usage (if metrics-server available)
7. ✅ Log errors
8. ✅ Environment-specific validations
- Dev: Single replica, resource limits
- Prod: HA config, TLS, Zookeeper replicas, HPA
## Storage Configuration
### Development (Kind)
```yaml
global:
storageClass: "standard" # Kind's default provisioner
```
### Production (MicroK8s)
```yaml
global:
storageClass: "microk8s-hostpath" # Or custom storage class
```
**Storage Requirements:**
- **Development**: ~35 GiB total
- SigNoz: 5 GiB
- ClickHouse: 20 GiB
- Zookeeper: 5 GiB
- Alertmanager: 2 GiB
- **Production**: ~135 GiB total
- SigNoz: 20 GiB
- ClickHouse: 100 GiB
- Zookeeper: 10 GiB
- Alertmanager: 5 GiB
## Resource Requirements
### Development Environment
**Minimum:**
- CPU: 550m (0.55 cores)
- Memory: 1.6 GiB
- Storage: 35 GiB
**Recommended:**
- CPU: 3 cores
- Memory: 3 GiB
- Storage: 50 GiB
### Production Environment
**Minimum:**
- CPU: 3.5 cores
- Memory: 8 GiB
- Storage: 135 GiB
**Recommended:**
- CPU: 12 cores
- Memory: 20 GiB
- Storage: 200 GiB
## Data Retention
### Development
- Traces: 7 days (168 hours)
- Metrics: 7 days (168 hours)
- Logs: 7 days (168 hours)
### Production
- Traces: 30 days (720 hours)
- Metrics: 30 days (720 hours)
- Logs: 30 days (720 hours)
- Cold storage after 30 days
To modify retention, update the environment variables:
```yaml
signoz:
env:
signoz_traces_ttl_duration_hrs: "720" # 30 days
signoz_metrics_ttl_duration_hrs: "720" # 30 days
signoz_logs_ttl_duration_hrs: "168" # 7 days
```
## High Availability (Production)
### Replication Strategy
```yaml
signoz: 2 replicas + HPA (min: 2, max: 5)
clickhouse: 2 replicas
zookeeper: 3 replicas (critical!)
otelCollector: 2 replicas + HPA (min: 2, max: 10)
alertmanager: 2 replicas
```
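These HPA targets map onto the chart's autoscaling blocks; a sketch for the collector, assuming the standard `minReplicas`/`maxReplicas` keys (the utilization targets below match `signoz-values-prod.yaml`):
```yaml
otelCollector:
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
```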
### Pod Anti-Affinity
Ensures pods are distributed across different nodes:
```yaml
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchLabels:
app.kubernetes.io/component: query-service
topologyKey: kubernetes.io/hostname
```
### Pod Disruption Budgets
Configured for all critical components:
```yaml
podDisruptionBudget:
enabled: true
minAvailable: 1
```
## Monitoring and Alerting
### Email Alerts (Production)
Configure SMTP in production values:
```yaml
signoz:
env:
signoz_smtp_enabled: "true"
signoz_smtp_host: "smtp.gmail.com"
signoz_smtp_port: "587"
signoz_smtp_from: "alerts@bakewise.ai"
signoz_smtp_username: "alerts@bakewise.ai"
# Set via secret: signoz_smtp_password
```
### Slack Alerts (Production)
Configure webhook in Alertmanager:
```yaml
alertmanager:
config:
receivers:
- name: 'critical-alerts'
slack_configs:
- api_url: '${SLACK_WEBHOOK_URL}'
channel: '#alerts-critical'
```
### Self-Monitoring
SigNoz monitors itself:
```yaml
selfMonitoring:
enabled: true
serviceMonitor:
enabled: true # Prod only
interval: 30s
```
## Troubleshooting
### Common Issues
**1. Pods not starting**
```bash
# Check pod status
kubectl get pods -n bakery-ia -l app.kubernetes.io/instance=signoz
# Check pod logs
kubectl logs -n bakery-ia <pod-name>
# Describe pod for events
kubectl describe pod -n bakery-ia <pod-name>
```
**2. Docker Hub rate limits**
```bash
# Verify secret exists
kubectl get secret dockerhub-creds -n bakery-ia
# Recreate secret
kubectl delete secret dockerhub-creds -n bakery-ia
export DOCKERHUB_USERNAME='your-username'
export DOCKERHUB_PASSWORD='your-token'
./deploy-signoz.sh dev
```
**3. ClickHouse connection issues**
```bash
# Check ClickHouse pod
kubectl logs -n bakery-ia -l app.kubernetes.io/component=clickhouse
# Check Zookeeper (required by ClickHouse)
kubectl logs -n bakery-ia -l app.kubernetes.io/component=zookeeper
```
**4. OTel Collector not receiving data**
```bash
# Check OTel Collector logs
kubectl logs -n bakery-ia -l app.kubernetes.io/component=otel-collector
# Test connectivity
kubectl port-forward -n bakery-ia svc/signoz-otel-collector 4318:4318
curl -v http://localhost:4318/v1/traces
```
**5. Insufficient storage**
```bash
# Check PVC status
kubectl get pvc -n bakery-ia
# Check storage usage (if metrics-server available)
kubectl top pods -n bakery-ia
```
### Debug Mode
Enable debug exporter in OTel Collector:
```yaml
otelCollector:
config:
exporters:
debug:
verbosity: detailed
sampling_initial: 5
sampling_thereafter: 200
service:
pipelines:
traces:
exporters: [clickhousetraces, debug] # Add debug
```
### Upgrade from Old Version
If upgrading from pre-v0.89.0:
```bash
# 1. Backup data (recommended)
kubectl get all -n bakery-ia -o yaml > signoz-backup.yaml
# 2. Remove old deployment
./deploy-signoz.sh --remove prod
# 3. Deploy new version
./deploy-signoz.sh prod
# 4. Verify
./verify-signoz.sh prod
```
## Security Best Practices
1. **Change default password** immediately after first login
2. **Use TLS/SSL** in production (configured with cert-manager)
3. **Network policies** enabled in production
4. **Run as non-root** (configured in securityContext)
5. **RBAC** with dedicated service account
6. **Secrets management** for sensitive data such as the SMTP password and Slack webhooks (see the sketch after this list)
7. **Image pull secrets** to avoid exposing Docker Hub credentials
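For item 6, a minimal sketch of keeping the SMTP password out of the values file; the Secret name and key below are placeholders, and how the chart consumes the secret should be confirmed against the chart documentation:
```yaml
apiVersion: v1
kind: Secret
metadata:
  name: signoz-smtp-credentials    # placeholder name
  namespace: bakery-ia
type: Opaque
stringData:
  signoz_smtp_password: "<smtp-password>"   # reference this instead of hard-coding the value
```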
## Backup and Recovery
### Backup ClickHouse Data
```bash
# Export ClickHouse data
kubectl exec -n bakery-ia <clickhouse-pod> -- clickhouse-client \
  --query="BACKUP DATABASE signoz_traces TO Disk('backups', 'traces_backup.zip')"
# Copy backup out
kubectl cp bakery-ia/<clickhouse-pod>:/var/lib/clickhouse/backups/ ./backups/
```
### Restore from Backup
```bash
# Copy backup in
kubectl cp ./backups/ bakery-ia/<clickhouse-pod>:/var/lib/clickhouse/backups/
# Restore
kubectl exec -n bakery-ia <clickhouse-pod> -- clickhouse-client \
--query="RESTORE DATABASE signoz_traces FROM Disk('backups', 'traces_backup.zip')"
```
## Updating Configuration
To update SigNoz configuration:
1. Edit values file: `signoz-values-{env}.yaml`
2. Apply changes:
```bash
./deploy-signoz.sh --upgrade {env}
```
3. Verify:
```bash
./verify-signoz.sh {env}
```
## Uninstallation
```bash
# Remove SigNoz deployment
./deploy-signoz.sh --remove {env}
# Optionally delete PVCs (WARNING: deletes all data)
kubectl delete pvc -n bakery-ia -l app.kubernetes.io/instance=signoz
# Note: SigNoz shares the bakery-ia namespace with the application, so do not delete the namespace itself
```
## References
- [SigNoz Official Documentation](https://signoz.io/docs/)
- [SigNoz Helm Charts Repository](https://github.com/SigNoz/charts)
- [OpenTelemetry Documentation](https://opentelemetry.io/docs/)
- [ClickHouse Documentation](https://clickhouse.com/docs/)
## Support
For issues or questions:
1. Check [SigNoz GitHub Issues](https://github.com/SigNoz/signoz/issues)
2. Review deployment logs: `kubectl logs -n bakery-ia <pod-name>`
3. Run verification script: `./verify-signoz.sh {env}`
4. Check [SigNoz Community Slack](https://signoz.io/slack)
---
**Last Updated**: 2026-01-09
**SigNoz Helm Chart Version**: Latest (v0.129.12 components)
**Maintained by**: Bakery IA Team

View File

@@ -30,7 +30,7 @@ show_help() {
-d, --dry-run Dry run - show what would be done without actually deploying -d, --dry-run Dry run - show what would be done without actually deploying
-u, --upgrade Upgrade existing deployment -u, --upgrade Upgrade existing deployment
-r, --remove Remove/Uninstall SigNoz deployment -r, --remove Remove/Uninstall SigNoz deployment
-n, --namespace NAMESPACE Specify namespace (default: signoz)" -n, --namespace NAMESPACE Specify namespace (default: bakery-ia)"
echo "" echo ""
echo "Examples: echo "Examples:
$0 dev # Deploy to development $0 dev # Deploy to development
@@ -51,7 +51,7 @@ show_help() {
DRY_RUN=false DRY_RUN=false
UPGRADE=false UPGRADE=false
REMOVE=false REMOVE=false
NAMESPACE="signoz" NAMESPACE="bakery-ia"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
@@ -208,92 +208,90 @@ create_dockerhub_secret() {
echo "" echo ""
} }
# Function to add and update Helm repository
setup_helm_repo() {
echo "${BLUE}Setting up SigNoz Helm repository...${NC}"
if [[ "$DRY_RUN" == true ]]; then
echo " (dry-run) Would add SigNoz Helm repository"
return
fi
# Add SigNoz Helm repository
if helm repo list | grep -q "^signoz"; then
echo "${BLUE}SigNoz repository already added, updating...${NC}"
helm repo update signoz
else
echo "${BLUE}Adding SigNoz Helm repository...${NC}"
helm repo add signoz https://charts.signoz.io
helm repo update
fi
echo "${GREEN}Helm repository ready.${NC}"
echo ""
}
# Function to deploy SigNoz # Function to deploy SigNoz
deploy_signoz() { deploy_signoz() {
local values_file="infrastructure/helm/signoz-values-$ENVIRONMENT.yaml" local values_file="infrastructure/helm/signoz-values-$ENVIRONMENT.yaml"
if [[ ! -f "$values_file" ]]; then if [[ ! -f "$values_file" ]]; then
echo "${RED}Error: Values file $values_file not found.${NC}" echo "${RED}Error: Values file $values_file not found.${NC}"
exit 1 exit 1
fi fi
echo "${BLUE}Deploying SigNoz to $ENVIRONMENT environment...${NC}" echo "${BLUE}Deploying SigNoz to $ENVIRONMENT environment...${NC}"
echo " Using values file: $values_file" echo " Using values file: $values_file"
echo " Target namespace: $NAMESPACE" echo " Target namespace: $NAMESPACE"
echo " Chart version: Latest from signoz/signoz"
if [[ "$DRY_RUN" == true ]]; then if [[ "$DRY_RUN" == true ]]; then
echo " (dry-run) Would deploy SigNoz with:" echo " (dry-run) Would deploy SigNoz with:"
echo " helm install signoz signoz/signoz -n $NAMESPACE -f $values_file" echo " helm upgrade --install signoz signoz/signoz -n $NAMESPACE -f $values_file --wait --timeout 15m"
return return
fi fi
# Use upgrade --install to handle both new installations and upgrades # Use upgrade --install to handle both new installations and upgrades
echo "${BLUE}Installing/Upgrading SigNoz...${NC}" echo "${BLUE}Installing/Upgrading SigNoz...${NC}"
helm upgrade --install signoz signoz/signoz -n "$NAMESPACE" -f "$values_file" echo "This may take 10-15 minutes..."
echo "${GREEN}SigNoz deployment initiated.${NC}" helm upgrade --install signoz signoz/signoz \
echo "Waiting for pods to become ready..." -n "$NAMESPACE" \
-f "$values_file" \
# Wait for deployment to complete --wait \
wait_for_deployment --timeout 15m \
--create-namespace
echo "${GREEN}SigNoz deployment completed.${NC}"
echo ""
# Show deployment status
show_deployment_status
} }
# Function to remove SigNoz # Function to remove SigNoz
remove_signoz() { remove_signoz() {
echo "${BLUE}Removing SigNoz deployment from namespace $NAMESPACE...${NC}" echo "${BLUE}Removing SigNoz deployment from namespace $NAMESPACE...${NC}"
if [[ "$DRY_RUN" == true ]]; then if [[ "$DRY_RUN" == true ]]; then
echo " (dry-run) Would remove SigNoz deployment" echo " (dry-run) Would remove SigNoz deployment"
return return
fi fi
if helm list -n "$NAMESPACE" | grep -q signoz; then if helm list -n "$NAMESPACE" | grep -q signoz; then
helm uninstall signoz -n "$NAMESPACE" helm uninstall signoz -n "$NAMESPACE" --wait
echo "${GREEN}SigNoz deployment removed.${NC}" echo "${GREEN}SigNoz deployment removed.${NC}"
# Optionally remove PVCs (commented out by default for safety)
echo ""
echo "${YELLOW}Note: Persistent Volume Claims (PVCs) were NOT deleted.${NC}"
echo "To delete PVCs and all data, run:"
echo " kubectl delete pvc -n $NAMESPACE -l app.kubernetes.io/instance=signoz"
else else
echo "${YELLOW}No SigNoz deployment found in namespace $NAMESPACE.${NC}" echo "${YELLOW}No SigNoz deployment found in namespace $NAMESPACE.${NC}"
fi fi
} }
# Function to wait for deployment to complete
wait_for_deployment() {
echo "${BLUE}Waiting for SigNoz pods to become ready...${NC}"
# Wait for pods to be ready
local timeout=600 # 10 minutes
local start_time=$(date +%s)
while true; do
local current_time=$(date +%s)
local elapsed=$((current_time - start_time))
if [[ $elapsed -ge $timeout ]]; then
echo "${RED}Timeout waiting for SigNoz pods to become ready.${NC}"
break
fi
# Check pod status
local ready_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz --field-selector=status.phase=Running 2>/dev/null | grep -c "Running" | tr -d '[:space:]' || echo "0")
local total_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d '[:space:]' || echo "0")
if [[ $ready_pods -eq 0 ]]; then
echo " Waiting for pods to start..."
else
echo " $ready_pods/$total_pods pods are running"
if [[ $ready_pods -eq $total_pods && $total_pods -gt 0 ]]; then
echo "${GREEN}All SigNoz pods are running!${NC}"
break
fi
fi
sleep 10
done
# Show deployment status
show_deployment_status
}
# Function to show deployment status # Function to show deployment status
show_deployment_status() { show_deployment_status() {
echo "" echo ""
@@ -322,30 +320,36 @@ show_deployment_status() {
# Function to show access information # Function to show access information
show_access_info() { show_access_info() {
echo "${BLUE}=== Access Information ===${NC}" echo "${BLUE}=== Access Information ===${NC}"
if [[ "$ENVIRONMENT" == "dev" ]]; then if [[ "$ENVIRONMENT" == "dev" ]]; then
echo "SigNoz UI: https://localhost/signoz" echo "SigNoz UI: http://monitoring.bakery-ia.local"
echo "SigNoz API: https://localhost/signoz-api"
echo "" echo ""
echo "OpenTelemetry Collector Endpoints:" echo "OpenTelemetry Collector Endpoints (from within cluster):"
echo " gRPC: localhost:4317" echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317"
echo " HTTP: localhost:4318" echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318"
echo " Metrics: localhost:8888" echo ""
echo "Port-forward for local access:"
echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080"
echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4317:4317"
echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4318:4318"
else else
echo "SigNoz UI: https://monitoring.bakewise.ai/signoz" echo "SigNoz UI: https://monitoring.bakewise.ai"
echo "SigNoz API: https://monitoring.bakewise.ai/signoz-api"
echo "SigNoz Alerts: https://monitoring.bakewise.ai/signoz-alerts"
echo "" echo ""
echo "OpenTelemetry Collector Endpoints:" echo "OpenTelemetry Collector Endpoints (from within cluster):"
echo " gRPC: monitoring.bakewise.ai:4317" echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317"
echo " HTTP: monitoring.bakewise.ai:4318" echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318"
echo ""
echo "External endpoints (if exposed):"
echo " Check ingress configuration for external OTLP endpoints"
fi fi
echo "" echo ""
echo "Default credentials:" echo "Default credentials:"
echo " Username: admin" echo " Username: admin@example.com"
echo " Password: admin" echo " Password: admin"
echo "" echo ""
echo "Note: Change default password after first login!"
echo ""
} }
# Main execution # Main execution
@@ -368,6 +372,9 @@ main() {
exit 0 exit 0
fi fi
# Setup Helm repository
setup_helm_repo
# Create Docker Hub secret for image pulls # Create Docker Hub secret for image pulls
create_dockerhub_secret create_dockerhub_secret

View File

@@ -1,11 +1,13 @@
# SigNoz Helm Chart Values - Development Environment # SigNoz Helm Chart Values - Development Environment
# Optimized for local development with minimal resource usage # Optimized for local development with minimal resource usage
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress
# #
# Official Chart: https://github.com/SigNoz/charts # Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-dev.yaml # Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-dev.yaml
global: global:
storageClass: "standard" storageClass: "standard"
clusterName: "bakery-ia-dev"
domain: "monitoring.bakery-ia.local" domain: "monitoring.bakery-ia.local"
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc) # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
imagePullSecrets: imagePullSecrets:
@@ -23,17 +25,10 @@ signoz:
type: ClusterIP type: ClusterIP
port: 8080 port: 8080
# DISABLE built-in ingress - using unified bakery-ingress instead
# Route configured in infrastructure/kubernetes/overlays/dev/dev-ingress.yaml
ingress: ingress:
enabled: true enabled: false
className: nginx
annotations: {}
hosts:
- host: monitoring.bakery-ia.local
paths:
- path: /
pathType: Prefix
port: 8080
tls: []
resources: resources:
requests: requests:
@@ -43,6 +38,17 @@ signoz:
cpu: 1000m cpu: 1000m
memory: 1Gi memory: 1Gi
# Environment variables (new format - replaces configVars)
env:
signoz_telemetrystore_provider: "clickhouse"
dot_metrics_enabled: "true"
signoz_emailing_enabled: "false"
signoz_alertmanager_provider: "signoz"
# Retention for dev (7 days)
signoz_traces_ttl_duration_hrs: "168"
signoz_metrics_ttl_duration_hrs: "168"
signoz_logs_ttl_duration_hrs: "168"
persistence: persistence:
enabled: true enabled: true
size: 5Gi size: 5Gi
@@ -92,6 +98,11 @@ clickhouse:
enabled: true enabled: true
installCustomStorageClass: false installCustomStorageClass: false
image:
registry: docker.io
repository: clickhouse/clickhouse-server
tag: 25.5.6 # Official recommended version
# Reduce ClickHouse resource requests for local dev # Reduce ClickHouse resource requests for local dev
clickhouse: clickhouse:
resources: resources:
@@ -102,15 +113,39 @@ clickhouse:
cpu: 1000m cpu: 1000m
memory: 1Gi memory: 1Gi
persistence:
enabled: true
size: 20Gi
# Zookeeper Configuration (required by ClickHouse) # Zookeeper Configuration (required by ClickHouse)
zookeeper: zookeeper:
enabled: true enabled: true
replicaCount: 1 # Single replica for dev
image:
tag: 3.7.1 # Official recommended version
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
persistence:
enabled: true
size: 5Gi
# OpenTelemetry Collector - Data ingestion endpoint for all telemetry # OpenTelemetry Collector - Data ingestion endpoint for all telemetry
otelCollector: otelCollector:
enabled: true enabled: true
replicaCount: 1 replicaCount: 1
image:
repository: signoz/signoz-otel-collector
tag: v0.129.12 # Latest recommended version
# Service configuration - expose both gRPC and HTTP endpoints # Service configuration - expose both gRPC and HTTP endpoints
service: service:
type: ClusterIP type: ClusterIP
@@ -130,6 +165,11 @@ otelCollector:
port: 8889 port: 8889
targetPort: 8889 targetPort: 8889
protocol: TCP protocol: TCP
# Metrics
- name: metrics
port: 8888
targetPort: 8888
protocol: TCP
resources: resources:
requests: requests:
@@ -210,10 +250,11 @@ otelCollector:
collection_interval: 60s collection_interval: 60s
processors: processors:
# Batch processor for better performance # Batch processor for better performance (optimized for high throughput)
batch: batch:
timeout: 10s timeout: 1s
send_batch_size: 1024 send_batch_size: 10000 # Increased from 1024 for better performance
send_batch_max_size: 10000
# Memory limiter to prevent OOM # Memory limiter to prevent OOM
memory_limiter: memory_limiter:
@@ -223,35 +264,57 @@ otelCollector:
# Resource detection # Resource detection
resourcedetection: resourcedetection:
detectors: [env, system] detectors: [env, system, docker]
timeout: 5s timeout: 5s
# Span metrics processor for automatic service metrics
spanmetrics:
metrics_exporter: signozclickhousemetrics
latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s]
dimensions_cache_size: 10000
exporters: exporters:
# ClickHouse exporter for traces # ClickHouse exporter for traces
clickhousetraces: clickhousetraces:
datasource: tcp://signoz-clickhouse:9000/?database=signoz_traces datasource: tcp://signoz-clickhouse:9000/?database=signoz_traces
timeout: 10s timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# ClickHouse exporter for metrics # ClickHouse exporter for metrics
signozclickhousemetrics: signozclickhousemetrics:
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics" dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
timeout: 10s timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# ClickHouse exporter for logs # ClickHouse exporter for logs
clickhouselogsexporter: clickhouselogsexporter:
dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs
timeout: 10s timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
# Debug exporter for debugging (optional) # Debug exporter for debugging (optional)
debug: debug:
verbosity: detailed verbosity: detailed
sampling_initial: 5
sampling_thereafter: 200
service: service:
pipelines: pipelines:
# Traces pipeline # Traces pipeline
traces: traces:
receivers: [otlp] receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection] processors: [memory_limiter, batch, spanmetrics, resourcedetection]
exporters: [clickhousetraces] exporters: [clickhousetraces]
# Metrics pipeline # Metrics pipeline

View File

@@ -1,11 +1,13 @@
# SigNoz Helm Chart Values - Production Environment # SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization # High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
# #
# Official Chart: https://github.com/SigNoz/charts # Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml # Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
global: global:
storageClass: "standard" storageClass: "standard" # For MicroK8s, use "microk8s-hostpath" or custom storage class
clusterName: "bakery-ia-prod"
domain: "monitoring.bakewise.ai" domain: "monitoring.bakewise.ai"
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc) # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
imagePullSecrets: imagePullSecrets:
@@ -15,43 +17,33 @@ global:
imagePullSecrets: imagePullSecrets:
- dockerhub-creds - dockerhub-creds
# Frontend Configuration # SigNoz Main Component (unified frontend + query service)
frontend: # BREAKING CHANGE: v0.89.0+ uses unified component instead of separate frontend/queryService
signoz:
replicaCount: 2 replicaCount: 2
image: image:
repository: signoz/frontend repository: signoz/signoz
tag: 0.52.3 tag: v0.106.0 # Latest stable version
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
service: service:
type: ClusterIP type: ClusterIP
port: 3301 port: 8080 # HTTP/API port
internalPort: 8085 # Internal gRPC port
# DISABLE built-in ingress - using unified bakery-ingress-prod instead
# Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
ingress: ingress:
enabled: true enabled: false
className: nginx
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
hosts:
- host: monitoring.bakewise.ai
paths:
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
tls:
- secretName: signoz-tls
hosts:
- monitoring.bakewise.ai
resources: resources:
requests: requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 500m cpu: 500m
memory: 1Gi memory: 1Gi
limits:
cpu: 2000m
memory: 4Gi
# Pod Anti-affinity for HA # Pod Anti-affinity for HA
affinity: affinity:
@@ -60,58 +52,27 @@ frontend:
- weight: 100 - weight: 100
podAffinityTerm: podAffinityTerm:
labelSelector: labelSelector:
matchExpressions: matchLabels:
- key: app app.kubernetes.io/component: query-service
operator: In
values:
- signoz-frontend
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
# Environment variables (new format - replaces configVars)
env: env:
- name: FRONTEND_REFRESH_INTERVAL signoz_telemetrystore_provider: "clickhouse"
value: "30000" dot_metrics_enabled: "true"
signoz_emailing_enabled: "true"
# Query Service Configuration signoz_alertmanager_provider: "signoz"
queryService: # Retention configuration (30 days for prod)
replicaCount: 2 signoz_traces_ttl_duration_hrs: "720"
image: signoz_metrics_ttl_duration_hrs: "720"
repository: signoz/query-service signoz_logs_ttl_duration_hrs: "720"
tag: 0.52.3 # SMTP configuration for email alerts
pullPolicy: IfNotPresent signoz_smtp_enabled: "true"
signoz_smtp_host: "smtp.gmail.com"
service: signoz_smtp_port: "587"
type: ClusterIP signoz_smtp_from: "alerts@bakewise.ai"
port: 8080 signoz_smtp_username: "alerts@bakewise.ai"
# Password should be set via secret: signoz_smtp_password
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1000m
memory: 2Gi
# Pod Anti-affinity for HA
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-query-service
topologyKey: kubernetes.io/hostname
env:
- name: DEPLOYMENT_TYPE
value: "kubernetes-helm"
- name: SIGNOZ_LOCAL_DB_PATH
value: "/var/lib/signoz"
- name: RETENTION_DAYS
value: "30"
persistence: persistence:
enabled: true enabled: true
@@ -128,7 +89,9 @@ queryService:
# AlertManager Configuration # AlertManager Configuration
alertmanager: alertmanager:
enabled: true
replicaCount: 2 replicaCount: 2
image: image:
repository: signoz/alertmanager repository: signoz/alertmanager
tag: 0.23.5 tag: 0.23.5
@@ -140,11 +103,11 @@ alertmanager:
resources: resources:
requests: requests:
cpu: 250m cpu: 100m
memory: 512Mi memory: 128Mi
limits: limits:
cpu: 500m cpu: 500m
memory: 1Gi memory: 512Mi
# Pod Anti-affinity for HA # Pod Anti-affinity for HA
affinity: affinity:
@@ -210,24 +173,24 @@ alertmanager:
# ClickHouse Configuration - Time Series Database # ClickHouse Configuration - Time Series Database
clickhouse: clickhouse:
replicaCount: 2 enabled: true
installCustomStorageClass: false
image: image:
registry: docker.io
repository: clickhouse/clickhouse-server repository: clickhouse/clickhouse-server
tag: 24.1.2-alpine tag: 25.5.6 # Updated to official recommended version
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
service: # ClickHouse resources (nested config)
type: ClusterIP clickhouse:
httpPort: 8123 resources:
tcpPort: 9000 requests:
cpu: 1000m
resources: memory: 2Gi
requests: limits:
cpu: 1000m cpu: 4000m
memory: 2Gi memory: 8Gi
limits:
cpu: 2000m
memory: 4Gi
# Pod Anti-affinity for HA # Pod Anti-affinity for HA
affinity: affinity:
@@ -246,50 +209,63 @@ clickhouse:
size: 100Gi size: 100Gi
storageClass: "standard" storageClass: "standard"
# ClickHouse configuration # Cold storage configuration for better disk space management
config: coldStorage:
logger:
level: information
max_connections: 4096
max_concurrent_queries: 500
# Data retention (30 days for prod)
merge_tree:
parts_to_delay_insert: 150
parts_to_throw_insert: 300
# Performance tuning
max_memory_usage: 10000000000
max_bytes_before_external_group_by: 20000000000
# Backup configuration
backup:
enabled: true enabled: true
schedule: "0 2 * * *" defaultKeepFreeSpaceBytes: 10737418240 # Keep 10GB free
retention: 7 ttl:
deleteTTLDays: 30 # Move old data to cold storage after 30 days
# Zookeeper Configuration (required by ClickHouse for coordination)
zookeeper:
enabled: true
replicaCount: 3 # CRITICAL: Always use 3 replicas for production HA
image:
tag: 3.7.1 # Official recommended version
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
persistence:
enabled: true
size: 10Gi
storageClass: "standard"
# OpenTelemetry Collector - Integrated with SigNoz # OpenTelemetry Collector - Integrated with SigNoz
otelCollector: otelCollector:
enabled: true enabled: true
replicaCount: 2 replicaCount: 2
image: image:
repository: signoz/signoz-otel-collector repository: signoz/signoz-otel-collector
tag: 0.102.8 tag: v0.129.12 # Updated to latest recommended version
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
service: service:
type: ClusterIP type: ClusterIP
ports: ports:
otlpGrpc: 4317 - name: otlp-grpc
otlpHttp: 4318 port: 4317
metrics: 8888 - name: otlp-http
healthCheck: 13133 port: 4318
- name: metrics
port: 8888
- name: healthcheck
port: 13133
resources: resources:
requests: requests:
cpu: 500m cpu: 500m
memory: 512Mi memory: 512Mi
limits: limits:
cpu: 1000m cpu: 2000m
memory: 1Gi memory: 2Gi
# Full OTEL Collector Configuration # Full OTEL Collector Configuration
config: config:
@@ -304,7 +280,7 @@ otelCollector:
protocols: protocols:
grpc: grpc:
endpoint: 0.0.0.0:4317 endpoint: 0.0.0.0:4317
max_recv_msg_size_mib: 16 max_recv_msg_size_mib: 32 # Increased for larger payloads
http: http:
endpoint: 0.0.0.0:4318 endpoint: 0.0.0.0:4318
cors: cors:
@@ -322,19 +298,20 @@ otelCollector:
- targets: ['localhost:8888'] - targets: ['localhost:8888']
processors: processors:
# High-performance batch processing (official recommendation)
batch: batch:
timeout: 10s timeout: 1s # Reduced from 10s for faster processing
send_batch_size: 2048 send_batch_size: 50000 # Increased from 2048 (official recommendation for traces)
send_batch_max_size: 4096 send_batch_max_size: 50000
memory_limiter: memory_limiter:
check_interval: 1s check_interval: 1s
limit_mib: 800 limit_mib: 1500 # 75% of container memory (2Gi = ~2048Mi)
spike_limit_mib: 200 spike_limit_mib: 300
# Resource detection for K8s # Resource detection for K8s
resourcedetection: resourcedetection:
detectors: [env, system, docker] detectors: [env, system, docker, kubernetes]
timeout: 5s timeout: 5s
# Add resource attributes # Add resource attributes
@@ -347,6 +324,12 @@ otelCollector:
value: bakery-ia-prod value: bakery-ia-prod
action: upsert action: upsert
# Span metrics processor for automatic service performance metrics
spanmetrics:
metrics_exporter: signozclickhousemetrics
latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s]
dimensions_cache_size: 100000
exporters: exporters:
# Export to SigNoz ClickHouse # Export to SigNoz ClickHouse
clickhousetraces: clickhousetraces:
@@ -387,8 +370,8 @@ otelCollector:
pipelines: pipelines:
traces: traces:
receivers: [otlp] receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource] processors: [memory_limiter, batch, spanmetrics, resourcedetection, resource]
exporters: [clickhousetraces, debug] exporters: [clickhousetraces]
metrics: metrics:
receivers: [otlp, prometheus] receivers: [otlp, prometheus]
@@ -398,12 +381,7 @@ otelCollector:
logs: logs:
receivers: [otlp] receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource] processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhouselogsexporter, debug] exporters: [clickhouselogsexporter]
# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
enabled: true
mode: deployment
# HPA for OTEL Collector # HPA for OTEL Collector
autoscaling: autoscaling:
@@ -413,29 +391,18 @@ otelCollectorDeployment:
targetCPUUtilizationPercentage: 70 targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80 targetMemoryUtilizationPercentage: 80
# Node Exporter for infrastructure metrics # Schema Migrator - Manages ClickHouse schema migrations
nodeExporter: schemaMigrator:
enabled: true enabled: true
service:
type: ClusterIP
port: 9100
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# Schemamanager - Manages ClickHouse schema
schemamanager:
enabled: true
image: image:
repository: signoz/signoz-schema-migrator repository: signoz/signoz-schema-migrator
tag: 0.52.3 tag: v0.129.12 # Updated to latest version
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
# Enable Helm hooks for proper upgrade handling
upgradeHelmHooks: true
# Additional Configuration # Additional Configuration
serviceAccount: serviceAccount:
create: true create: true

View File

@@ -26,7 +26,7 @@ show_help() {
echo "" echo ""
echo "Options: echo "Options:
-h, --help Show this help message -h, --help Show this help message
-n, --namespace NAMESPACE Specify namespace (default: signoz)" -n, --namespace NAMESPACE Specify namespace (default: bakery-ia)"
echo "" echo ""
echo "Examples: echo "Examples:
$0 dev # Verify development deployment $0 dev # Verify development deployment
@@ -35,7 +35,7 @@ show_help() {
} }
# Parse command line arguments # Parse command line arguments
NAMESPACE="signoz" NAMESPACE="bakery-ia"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
@@ -224,20 +224,28 @@ verify_deployment() {
# Function for development-specific verification # Function for development-specific verification
verify_dev_specific() { verify_dev_specific() {
echo "${BLUE}8. Development-specific checks...${NC}" echo "${BLUE}8. Development-specific checks...${NC}"
# Check if localhost ingress is configured # Check if ingress is configured
if kubectl get ingress -n "$NAMESPACE" | grep -q "localhost"; then if kubectl get ingress -n "$NAMESPACE" 2>/dev/null | grep -q "monitoring.bakery-ia.local"; then
echo "${GREEN}Localhost ingress configured${NC}" echo "${GREEN}Development ingress configured${NC}"
else else
echo "${YELLOW}⚠️ Localhost ingress not found${NC}" echo "${YELLOW}⚠️ Development ingress not found${NC}"
fi fi
# Check resource limits (should be lower for dev) # Check unified signoz component resource limits (should be lower for dev)
local query_service=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "") local signoz_mem=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "")
if [[ -n "$query_service" && "$query_service" == "512Mi" ]]; then if [[ -n "$signoz_mem" ]]; then
echo "${GREEN}Development resource limits applied${NC}" echo "${GREEN}SigNoz component found (memory limit: $signoz_mem)${NC}"
else else
echo "${YELLOW}⚠️ Resource limits may not be optimized for development${NC}" echo "${YELLOW}⚠️ Could not verify SigNoz component resources${NC}"
fi
# Check single replica setup for dev
local replicas=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "0")
if [[ $replicas -eq 1 ]]; then
echo "${GREEN}✅ Single replica configuration (appropriate for dev)${NC}"
else
echo "${YELLOW}⚠️ Multiple replicas detected (replicas: $replicas)${NC}"
fi fi
echo "" echo ""
} }
@@ -245,28 +253,54 @@ verify_dev_specific() {
# Function for production-specific verification # Function for production-specific verification
verify_prod_specific() { verify_prod_specific() {
echo "${BLUE}8. Production-specific checks...${NC}" echo "${BLUE}8. Production-specific checks...${NC}"
# Check if TLS is configured # Check if TLS is configured
if kubectl get ingress -n "$NAMESPACE" | grep -q "signoz-tls-cert"; then if kubectl get ingress -n "$NAMESPACE" 2>/dev/null | grep -q "signoz-tls"; then
echo "${GREEN}✅ TLS certificate configured${NC}" echo "${GREEN}✅ TLS certificate configured${NC}"
else else
echo "${YELLOW}⚠️ TLS certificate not found${NC}" echo "${YELLOW}⚠️ TLS certificate not found${NC}"
fi fi
# Check if multiple replicas are running # Check if multiple replicas are running for HA
local query_replicas=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1") local signoz_replicas=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "1")
if [[ $query_replicas -gt 1 ]]; then if [[ $signoz_replicas -gt 1 ]]; then
echo "${GREEN}✅ High availability configured ($query_replicas replicas)${NC}" echo "${GREEN}✅ High availability configured ($signoz_replicas SigNoz replicas)${NC}"
else else
echo "${YELLOW}⚠️ Single replica detected (not highly available)${NC}" echo "${YELLOW}⚠️ Single SigNoz replica detected (not highly available)${NC}"
fi fi
# Check resource limits (should be higher for prod) # Check Zookeeper replicas (critical for production)
local query_service=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "") local zk_replicas=$(kubectl get statefulset -n "$NAMESPACE" -l app.kubernetes.io/component=zookeeper -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "0")
if [[ -n "$query_service" && "$query_service" == "2Gi" ]]; then if [[ $zk_replicas -eq 3 ]]; then
echo "${GREEN}Production resource limits applied${NC}" echo "${GREEN}Zookeeper properly configured with 3 replicas${NC}"
elif [[ $zk_replicas -gt 0 ]]; then
echo "${YELLOW}⚠️ Zookeeper has $zk_replicas replicas (recommend 3 for production)${NC}"
else else
echo "${YELLOW}⚠️ Resource limits may not be optimized for production${NC}" echo "${RED}❌ Zookeeper not found${NC}"
fi
# Check OTel Collector replicas
local otel_replicas=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=otel-collector -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "1")
if [[ $otel_replicas -gt 1 ]]; then
echo "${GREEN}✅ OTel Collector HA configured ($otel_replicas replicas)${NC}"
else
echo "${YELLOW}⚠️ Single OTel Collector replica${NC}"
fi
# Check resource limits (should be higher for prod)
local signoz_mem=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "")
if [[ -n "$signoz_mem" ]]; then
echo "${GREEN}✅ Production resource limits applied (memory: $signoz_mem)${NC}"
else
echo "${YELLOW}⚠️ Could not verify resource limits${NC}"
fi
# Check HPA (Horizontal Pod Autoscaler)
local hpa_count=$(kubectl get hpa -n "$NAMESPACE" 2>/dev/null | grep -c signoz || echo "0")
if [[ $hpa_count -gt 0 ]]; then
echo "${GREEN}✅ Horizontal Pod Autoscaler configured${NC}"
else
echo "${YELLOW}⚠️ No HPA found (consider enabling for production)${NC}"
fi fi
echo "" echo ""
} }
@@ -278,39 +312,50 @@ show_access_info() {
echo "📋 Access Information" echo "📋 Access Information"
echo "==========================================" echo "=========================================="
echo "${NC}" echo "${NC}"
if [[ "$ENVIRONMENT" == "dev" ]]; then if [[ "$ENVIRONMENT" == "dev" ]]; then
echo "SigNoz UI: https://localhost/signoz" echo "SigNoz UI: http://monitoring.bakery-ia.local"
echo "SigNoz API: https://localhost/signoz-api"
echo "" echo ""
echo "OpenTelemetry Collector:" echo "OpenTelemetry Collector (within cluster):"
echo " gRPC: localhost:4317" echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317"
echo " HTTP: localhost:4318" echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318"
echo " Metrics: localhost:8888" echo ""
echo "Port-forward for local access:"
echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080"
echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4317:4317"
echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4318:4318"
else else
echo "SigNoz UI: https://monitoring.bakewise.ai/signoz" echo "SigNoz UI: https://monitoring.bakewise.ai"
echo "SigNoz API: https://monitoring.bakewise.ai/signoz-api"
echo "SigNoz Alerts: https://monitoring.bakewise.ai/signoz-alerts"
echo "" echo ""
echo "OpenTelemetry Collector:" echo "OpenTelemetry Collector (within cluster):"
echo " gRPC: monitoring.bakewise.ai:4317" echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317"
echo " HTTP: monitoring.bakewise.ai:4318" echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318"
fi fi
echo "" echo ""
echo "Default Credentials:" echo "Default Credentials:"
echo " Username: admin" echo " Username: admin@example.com"
echo " Password: admin" echo " Password: admin"
echo "" echo ""
echo "⚠️ IMPORTANT: Change default password after first login!"
echo ""
# Show connection test commands # Show connection test commands
echo "Connection Test Commands:" echo "Connection Test Commands:"
if [[ "$ENVIRONMENT" == "dev" ]]; then if [[ "$ENVIRONMENT" == "dev" ]]; then
echo " curl -k https://localhost/signoz" echo " # Test SigNoz UI"
echo " curl -k https://localhost/signoz-api/health" echo " curl http://monitoring.bakery-ia.local"
echo ""
echo " # Test via port-forward"
echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080"
echo " curl http://localhost:8080"
else else
echo " curl https://monitoring.bakewise.ai/signoz" echo " # Test SigNoz UI"
echo " curl https://monitoring.bakewise.ai/signoz-api/health" echo " curl https://monitoring.bakewise.ai"
echo ""
echo " # Test API health"
echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080"
echo " curl http://localhost:8080/api/v1/health"
fi fi
echo "" echo ""
} }
@@ -322,36 +367,43 @@ run_connectivity_tests() {
echo "🔗 Running Connectivity Tests" echo "🔗 Running Connectivity Tests"
echo "==========================================" echo "=========================================="
echo "${NC}" echo "${NC}"
if [[ "$ENVIRONMENT" == "dev" ]]; then # Test pod readiness first
# Test frontend echo "Checking pod readiness..."
echo "Testing SigNoz frontend..." local ready_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz --field-selector=status.phase=Running 2>/dev/null | grep "Running" | grep -c "1/1\|2/2" || echo "0")
if curl -k -s -o /dev/null -w "%{http_code}" https://localhost/signoz | grep -q "200\|302"; then local total_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d ' ' || echo "0")
echo "${GREEN}✅ Frontend accessible${NC}"
else if [[ $ready_pods -eq $total_pods && $total_pods -gt 0 ]]; then
echo "${RED}❌ Frontend not accessible${NC}" echo "${GREEN}✅ All pods are ready ($ready_pods/$total_pods)${NC}"
fi
# Test API
echo "Testing SigNoz API..."
if curl -k -s -o /dev/null -w "%{http_code}" https://localhost/signoz-api/health | grep -q "200"; then
echo "${GREEN}✅ API accessible${NC}"
else
echo "${RED}❌ API not accessible${NC}"
fi
# Test OTEL collector
echo "Testing OpenTelemetry collector..."
if curl -s -o /dev/null -w "%{http_code}" http://localhost:8888/metrics | grep -q "200"; then
echo "${GREEN}✅ OTEL collector accessible${NC}"
else
echo "${YELLOW}⚠️ OTEL collector not accessible (may not be exposed)${NC}"
fi
else else
echo "${YELLOW}⚠️ Production connectivity tests require valid DNS and TLS${NC}" echo "${YELLOW}⚠️ Some pods not ready ($ready_pods/$total_pods)${NC}"
echo " Please ensure monitoring.bakewise.ai resolves to your cluster"
fi fi
echo "" echo ""
# Test internal service connectivity
echo "Testing internal service connectivity..."
local signoz_svc=$(kubectl get svc -n "$NAMESPACE" signoz -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "")
if [[ -n "$signoz_svc" ]]; then
echo "${GREEN}✅ SigNoz service accessible at $signoz_svc:8080${NC}"
else
echo "${RED}❌ SigNoz service not found${NC}"
fi
local otel_svc=$(kubectl get svc -n "$NAMESPACE" signoz-otel-collector -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "")
if [[ -n "$otel_svc" ]]; then
echo "${GREEN}✅ OTel Collector service accessible at $otel_svc:4317 (gRPC), $otel_svc:4318 (HTTP)${NC}"
else
echo "${RED}❌ OTel Collector service not found${NC}"
fi
echo ""
if [[ "$ENVIRONMENT" == "prod" ]]; then
echo "${YELLOW}⚠️ Production connectivity tests require valid DNS and TLS${NC}"
echo " Please ensure monitoring.bakewise.ai resolves to your cluster"
echo ""
echo "Manual test:"
echo " curl -I https://monitoring.bakewise.ai"
fi
} }
# Main execution # Main execution

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "ai-insights-service" value: "ai-insights-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -98,9 +98,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "auth-service" value: "auth-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -62,9 +62,9 @@ spec:
value: "3" value: "3"
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "distribution-service" value: "distribution-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -90,9 +90,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "external-service" value: "external-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "forecasting-service" value: "forecasting-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -52,7 +52,7 @@ spec:
name: whatsapp-secrets name: whatsapp-secrets
env: env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
resources: resources:
requests: requests:
memory: "256Mi" memory: "256Mi"

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "inventory-service" value: "inventory-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "notification-service" value: "notification-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "orchestrator-service" value: "orchestrator-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "orders-service" value: "orders-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "pos-service" value: "pos-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "procurement-service" value: "procurement-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "production-service" value: "production-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "recipes-service" value: "recipes-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "sales-service" value: "sales-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "suppliers-service" value: "suppliers-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "tenant-service" value: "tenant-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -97,9 +97,9 @@ spec:
env: env:
# OpenTelemetry Configuration # OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT - name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT - name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME - name: OTEL_SERVICE_NAME
value: "training-service" value: "training-service"
- name: ENABLE_TRACING - name: ENABLE_TRACING

View File

@@ -385,13 +385,13 @@ data:
# OBSERVABILITY - SigNoz (Unified Monitoring) # OBSERVABILITY - SigNoz (Unified Monitoring)
# ================================================================ # ================================================================
# OpenTelemetry Configuration - Direct to SigNoz # OpenTelemetry Configuration - Direct to SigNoz
OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
OTEL_SERVICE_NAME: "bakery-ia" OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
# SigNoz Endpoints (v0.106.0+ unified service) # SigNoz Endpoints (v0.106.0+ unified service)
SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080" SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local" SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
# ================================================================ # ================================================================
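A fast way to confirm the rendered config actually carries the new in-namespace endpoints is to grep the live ConfigMaps rather than the source files. A sketch, assuming everything lands in the bakery-ia namespace (the ConfigMap name is not shown here, so the grep stays broad):

```bash
# Show every OTLP / SigNoz endpoint currently stored in ConfigMaps.
kubectl get configmap -n bakery-ia -o yaml \
  | grep -nE 'OTEL_EXPORTER_OTLP_ENDPOINT|SIGNOZ_ENDPOINT|SIGNOZ_FRONTEND_URL'
```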

View File

@@ -73,7 +73,14 @@ spec:
name: gateway-service name: gateway-service
port: port:
number: 8000 number: 8000
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace # SigNoz Monitoring on subdomain (deployed via Helm in bakery-ia namespace)
# SigNoz creates its own Ingress via Helm chart configuration (signoz-values-dev.yaml) - host: monitoring.bakery-ia.local
# Access at: https://monitoring.bakery-ia.local/ http:
# SignOz is served at the root of the monitoring subdomain paths:
- path: /
pathType: Prefix
backend:
service:
name: signoz
port:
number: 8080
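With the monitoring host now carried by the application ingress instead of a Helm-created one, the route can be exercised end to end from a workstation. A sketch, assuming monitoring.bakery-ia.local already resolves locally (for example via an /etc/hosts entry) and that the self-signed dev certificate is acceptable:

```bash
# List ingress hosts, then hit the monitoring host directly
# (-k skips TLS verification for the local self-signed certificate).
kubectl get ingress -n bakery-ia -o wide
curl -skI https://monitoring.bakery-ia.local | head -n 1
```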

View File

@@ -61,7 +61,7 @@ patches:
value: "true" value: "true"
- op: add - op: add
path: /data/OTEL_EXPORTER_OTLP_ENDPOINT path: /data/OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
- op: add - op: add
path: /data/OTEL_EXPORTER_OTLP_PROTOCOL path: /data/OTEL_EXPORTER_OTLP_PROTOCOL
value: "grpc" value: "grpc"

View File

@@ -23,13 +23,13 @@ data:
ENABLE_LOGS: "true" ENABLE_LOGS: "true"
# OpenTelemetry Configuration - Direct to SigNoz # OpenTelemetry Configuration - Direct to SigNoz
OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
OTEL_SERVICE_NAME: "bakery-ia" OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod"
# SigNoz Endpoints (v0.106.0+ unified service) # SigNoz Endpoints (v0.106.0+ unified service)
SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080" SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai" SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai"
SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai" SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai"

View File

@@ -41,6 +41,7 @@ spec:
tls: tls:
- hosts: - hosts:
- bakewise.ai - bakewise.ai
- monitoring.bakewise.ai
secretName: bakery-ia-prod-tls-cert secretName: bakery-ia-prod-tls-cert
rules: rules:
- host: bakewise.ai - host: bakewise.ai
@@ -60,6 +61,14 @@ spec:
name: gateway-service name: gateway-service
port: port:
number: 8000 number: 8000
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace # SigNoz Monitoring on subdomain (deployed via Helm in bakery-ia namespace)
# SigNoz creates its own Ingress via Helm chart configuration - host: monitoring.bakewise.ai
# Access at: https://monitoring.bakewise.ai (configured in signoz-values-prod.yaml) http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: signoz
port:
number: 8080
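Adding monitoring.bakewise.ai to the TLS block means the certificate behind bakery-ia-prod-tls-cert has to be reissued with the new host in its SANs before the route works over HTTPS. A sketch for checking that, assuming cert-manager manages the secret and the ingress lives in the bakery-ia namespace:

```bash
# Inspect the SANs of the served certificate straight from the secret
# (the -ext flag needs OpenSSL 1.1.1+; older builds can fall back to -text).
kubectl get secret -n bakery-ia bakery-ia-prod-tls-cert -o jsonpath='{.data.tls\.crt}' \
  | base64 -d | openssl x509 -noout -ext subjectAltName
```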

View File

@@ -30,7 +30,7 @@ def setup_tracing(service_name: str = "ai-insights"):
resource = Resource.create({"service.name": service_name}) resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter( otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True insecure=True
) )

View File

@@ -35,7 +35,7 @@ def setup_tracing(service_name: str = "alert-processor"):
resource = Resource.create({"service.name": service_name}) resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter( otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True insecure=True
) )

View File

@@ -33,7 +33,7 @@ def setup_tracing(service_name: str = "demo-session"):
resource = Resource.create({"service.name": service_name}) resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter( otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True insecure=True
) )
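The tracing helpers above, and the shared library defaults that follow, all move off the old otel-collector.monitoring and .signoz hosts, so a repo-wide grep is a simple guard against stragglers. A minimal sketch; the file globs are only a starting point:

```bash
# Search for any leftover references to the old collector namespaces.
grep -rnE 'otel-collector\.monitoring|signoz-otel-collector\.signoz|signoz\.signoz\.svc' \
  --include='*.py' --include='*.yaml' --include='*.sh' . \
  || echo "no stale collector references found"
```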

View File

@@ -68,7 +68,7 @@ def setup_otel_logging(
if otel_endpoint is None: if otel_endpoint is None:
otel_endpoint = os.getenv( otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT", "OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318") os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
) )
# Ensure endpoint has /v1/logs path for HTTP # Ensure endpoint has /v1/logs path for HTTP

View File

@@ -69,7 +69,7 @@ def setup_otel_metrics(
if otel_endpoint is None: if otel_endpoint is None:
otel_endpoint = os.getenv( otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT", "OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318") os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
) )
# Ensure endpoint has /v1/metrics path for HTTP # Ensure endpoint has /v1/metrics path for HTTP

View File

@@ -22,7 +22,7 @@ def setup_tracing(
app, app,
service_name: str, service_name: str,
service_version: str = "1.0.0", service_version: str = "1.0.0",
otel_endpoint: str = "http://signoz-otel-collector.signoz:4318" otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
): ):
""" """
Setup OpenTelemetry distributed tracing for a FastAPI service. Setup OpenTelemetry distributed tracing for a FastAPI service.

View File

@@ -151,7 +151,7 @@ class BaseFastAPIService:
try: try:
otel_endpoint = os.getenv( otel_endpoint = os.getenv(
"OTEL_COLLECTOR_ENDPOINT", "OTEL_COLLECTOR_ENDPOINT",
"http://signoz-otel-collector.signoz:4318" "http://signoz-otel-collector.bakery-ia:4318"
) )
setup_tracing(self.app, self.service_name, self.version, otel_endpoint) setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
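Because every default above can still be overridden through OTEL_COLLECTOR_ENDPOINT / OTEL_EXPORTER_OTLP_ENDPOINT, a single service can be pointed at a port-forwarded collector for local debugging without touching the manifests. A sketch, assuming the bakery-ia namespace and a locally runnable service (the uvicorn line is a placeholder):

```bash
# Forward the collector's OTLP HTTP port to the workstation...
kubectl port-forward -n bakery-ia svc/signoz-otel-collector 4318:4318 &
# ...then point a locally started service at it via the env overrides.
export OTEL_COLLECTOR_ENDPOINT="http://localhost:4318"
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
# python -m uvicorn main:app --reload   # placeholder: run one service locally
```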
self.logger.info(f"Distributed tracing enabled for {self.service_name}") self.logger.info(f"Distributed tracing enabled for {self.service_name}")