Improve docker config
This commit is contained in:
@@ -1,11 +1,15 @@
|
||||
# infrastructure/monitoring/grafana/dashboards/dashboard.yml
|
||||
# Grafana dashboard provisioning
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Bakery Forecasting'
|
||||
- name: 'bakery-dashboards'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
folder: 'Bakery Forecasting'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
@@ -1,3 +1,6 @@
|
||||
# infrastructure/monitoring/grafana/datasources/prometheus.yml
|
||||
# Grafana Prometheus datasource configuration
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
@@ -6,4 +9,20 @@ datasources:
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
version: 1
|
||||
editable: true
|
||||
jsonData:
|
||||
timeInterval: "15s"
|
||||
queryTimeout: "60s"
|
||||
httpMethod: "POST"
|
||||
exemplarTraceIdDestinations:
|
||||
- name: trace_id
|
||||
datasourceUid: jaeger
|
||||
|
||||
- name: Jaeger
|
||||
type: jaeger
|
||||
access: proxy
|
||||
url: http://jaeger:16686
|
||||
uid: jaeger
|
||||
version: 1
|
||||
editable: true
|
||||
@@ -1,17 +1,30 @@
|
||||
---
|
||||
# infrastructure/monitoring/prometheus/prometheus.yml
|
||||
# Prometheus configuration
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'bakery-forecasting'
|
||||
replica: 'prometheus-01'
|
||||
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
- "/etc/prometheus/rules/*.yml"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
# - alertmanager:9093
|
||||
|
||||
scrape_configs:
|
||||
# Static scrape targets for microservices (no dynamic service discovery is configured)
|
||||
- job_name: 'gateway'
|
||||
static_configs:
|
||||
- targets: ['gateway:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
|
||||
- job_name: 'auth-service'
|
||||
static_configs:
|
||||
@@ -49,11 +62,21 @@ scrape_configs:
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
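# Infrastructure monitoring (NOTE: redis:6379 is the Redis protocol port, not an HTTP /metrics endpoint; scraping Redis requires a redis_exporter)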
|
||||
- job_name: 'redis'
|
||||
static_configs:
|
||||
- targets: ['redis:6379']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'rabbitmq'
|
||||
static_configs:
|
||||
- targets: ['rabbitmq:15692']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# Database monitoring (requires postgres_exporter)
|
||||
- job_name: 'postgres'
|
||||
static_configs:
|
||||
- targets: ['postgres-exporter:9187']
|
||||
scrape_interval: 30s
|
||||
86
infrastructure/monitoring/prometheus/rules/alerts.yml
Normal file
86
infrastructure/monitoring/prometheus/rules/alerts.yml
Normal file
@@ -0,0 +1,86 @@
|
||||
# infrastructure/monitoring/prometheus/rules/alerts.yml
|
||||
# Prometheus alerting rules
|
||||
|
||||
groups:
|
||||
- name: bakery_services
|
||||
rules:
|
||||
# Service availability alerts
|
||||
- alert: ServiceDown
|
||||
expr: up == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service {{ $labels.job }} is down"
|
||||
description: "Service {{ $labels.job }} has been down for more than 2 minutes."
|
||||
|
||||
# High error rate alerts
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate on {{ $labels.job }}"
|
||||
description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."
|
||||
|
||||
# High response time alerts
|
||||
- alert: HighResponseTime
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High response time on {{ $labels.job }}"
|
||||
description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."
|
||||
|
||||
# Memory usage alerts
|
||||
- alert: HighMemoryUsage
|
||||
expr: process_resident_memory_bytes / 1024 / 1024 > 500
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.job }}"
|
||||
description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."
|
||||
|
||||
# Database connection alerts
|
||||
- alert: DatabaseConnectionHigh
|
||||
expr: pg_stat_activity_count > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High database connections"
|
||||
description: "Database has {{ $value }} active connections."
|
||||
|
||||
- name: bakery_business
|
||||
rules:
|
||||
# Training job alerts
|
||||
- alert: TrainingJobFailed
|
||||
expr: increase(training_jobs_failed_total[1h]) > 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Training job failed"
|
||||
description: "{{ $value }} training jobs have failed in the last hour."
|
||||
|
||||
# Prediction accuracy alerts
|
||||
- alert: LowPredictionAccuracy
|
||||
expr: prediction_accuracy < 0.7
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Low prediction accuracy"
|
||||
description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."
|
||||
|
||||
# API rate limit alerts
|
||||
- alert: APIRateLimitHit
|
||||
expr: increase(rate_limit_hits_total[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "API rate limit hit frequently"
|
||||
description: "Rate limit has been hit {{ $value }} times in 5 minutes."
|
||||
6
infrastructure/pgadmin/pgpass
Normal file
6
infrastructure/pgadmin/pgpass
Normal file
@@ -0,0 +1,6 @@
|
||||
auth-db:5432:auth_db:auth_user:auth_pass123
|
||||
training-db:5432:training_db:training_user:training_pass123
|
||||
forecasting-db:5432:forecasting_db:forecasting_user:forecasting_pass123
|
||||
data-db:5432:data_db:data_user:data_pass123
|
||||
tenant-db:5432:tenant_db:tenant_user:tenant_pass123
|
||||
notification-db:5432:notification_db:notification_user:notification_pass123
|
||||
64
infrastructure/pgadmin/servers.json
Normal file
64
infrastructure/pgadmin/servers.json
Normal file
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"Servers": {
|
||||
"1": {
|
||||
"Name": "Auth Database",
|
||||
"Group": "Bakery Services",
|
||||
"Host": "auth-db",
|
||||
"Port": 5432,
|
||||
"MaintenanceDB": "auth_db",
|
||||
"Username": "auth_user",
|
||||
"PassFile": "/pgadmin4/pgpass",
|
||||
"SSLMode": "prefer"
|
||||
},
|
||||
"2": {
|
||||
"Name": "Training Database",
|
||||
"Group": "Bakery Services",
|
||||
"Host": "training-db",
|
||||
"Port": 5432,
|
||||
"MaintenanceDB": "training_db",
|
||||
"Username": "training_user",
|
||||
"PassFile": "/pgadmin4/pgpass",
|
||||
"SSLMode": "prefer"
|
||||
},
|
||||
"3": {
|
||||
"Name": "Forecasting Database",
|
||||
"Group": "Bakery Services",
|
||||
"Host": "forecasting-db",
|
||||
"Port": 5432,
|
||||
"MaintenanceDB": "forecasting_db",
|
||||
"Username": "forecasting_user",
|
||||
"PassFile": "/pgadmin4/pgpass",
|
||||
"SSLMode": "prefer"
|
||||
},
|
||||
"4": {
|
||||
"Name": "Data Database",
|
||||
"Group": "Bakery Services",
|
||||
"Host": "data-db",
|
||||
"Port": 5432,
|
||||
"MaintenanceDB": "data_db",
|
||||
"Username": "data_user",
|
||||
"PassFile": "/pgadmin4/pgpass",
|
||||
"SSLMode": "prefer"
|
||||
},
|
||||
"5": {
|
||||
"Name": "Tenant Database",
|
||||
"Group": "Bakery Services",
|
||||
"Host": "tenant-db",
|
||||
"Port": 5432,
|
||||
"MaintenanceDB": "tenant_db",
|
||||
"Username": "tenant_user",
|
||||
"PassFile": "/pgadmin4/pgpass",
|
||||
"SSLMode": "prefer"
|
||||
},
|
||||
"6": {
|
||||
"Name": "Notification Database",
|
||||
"Group": "Bakery Services",
|
||||
"Host": "notification-db",
|
||||
"Port": 5432,
|
||||
"MaintenanceDB": "notification_db",
|
||||
"Username": "notification_user",
|
||||
"PassFile": "/pgadmin4/pgpass",
|
||||
"SSLMode": "prefer"
|
||||
}
|
||||
}
|
||||
}
|
||||
26
infrastructure/postgres/init-scripts/init.sql
Normal file
26
infrastructure/postgres/init-scripts/init.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
-- Create extensions needed by all databases (each is installed only in the database this script runs against)
|
||||
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||
CREATE EXTENSION IF NOT EXISTS "pg_stat_statements";
|
||||
CREATE EXTENSION IF NOT EXISTS "pg_trgm";
|
||||
|
||||
-- Create Spanish collation for proper text sorting
|
||||
-- This will be used for bakery names, product names, etc.
|
||||
-- CREATE COLLATION IF NOT EXISTS spanish (provider = icu, locale = 'es-ES');
|
||||
|
||||
-- Set timezone to Madrid (applies to the current session only; use ALTER DATABASE ... SET timezone to persist it)
|
||||
SET timezone = 'Europe/Madrid';
|
||||
|
||||
-- Performance tuning for small to medium databases
|
||||
ALTER SYSTEM SET shared_preload_libraries = 'pg_stat_statements';
|
||||
ALTER SYSTEM SET max_connections = 100;
|
||||
ALTER SYSTEM SET shared_buffers = '256MB';
|
||||
ALTER SYSTEM SET effective_cache_size = '1GB';
|
||||
ALTER SYSTEM SET maintenance_work_mem = '64MB';
|
||||
ALTER SYSTEM SET checkpoint_completion_target = 0.9;
|
||||
ALTER SYSTEM SET wal_buffers = '16MB';
|
||||
ALTER SYSTEM SET default_statistics_target = 100;
|
||||
ALTER SYSTEM SET random_page_cost = 1.1;
|
||||
ALTER SYSTEM SET effective_io_concurrency = 200;
|
||||
|
||||
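-- Reload configuration (shared_preload_libraries, max_connections, shared_buffers and wal_buffers only take effect after a server restart)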
|
||||
SELECT pg_reload_conf();
|
||||
94
infrastructure/rabbitmq/definitions.json
Normal file
94
infrastructure/rabbitmq/definitions.json
Normal file
@@ -0,0 +1,94 @@
|
||||
{
|
||||
"rabbit_version": "3.12.0",
|
||||
"rabbitmq_version": "3.12.0",
|
||||
"product_name": "RabbitMQ",
|
||||
"product_version": "3.12.0",
|
||||
"users": [
|
||||
{
|
||||
"name": "bakery",
|
||||
"password_hash": "hash_of_forecast123",
|
||||
"hashing_algorithm": "rabbit_password_hashing_sha256",
|
||||
"tags": ["administrator"]
|
||||
}
|
||||
],
|
||||
"vhosts": [
|
||||
{
|
||||
"name": "/"
|
||||
}
|
||||
],
|
||||
"permissions": [
|
||||
{
|
||||
"user": "bakery",
|
||||
"vhost": "/",
|
||||
"configure": ".*",
|
||||
"write": ".*",
|
||||
"read": ".*"
|
||||
}
|
||||
],
|
||||
"exchanges": [
|
||||
{
|
||||
"name": "bakery_events",
|
||||
"vhost": "/",
|
||||
"type": "topic",
|
||||
"durable": true,
|
||||
"auto_delete": false,
|
||||
"internal": false,
|
||||
"arguments": {}
|
||||
}
|
||||
],
|
||||
"queues": [
|
||||
{
|
||||
"name": "training_events",
|
||||
"vhost": "/",
|
||||
"durable": true,
|
||||
"auto_delete": false,
|
||||
"arguments": {
|
||||
"x-message-ttl": 86400000
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "forecasting_events",
|
||||
"vhost": "/",
|
||||
"durable": true,
|
||||
"auto_delete": false,
|
||||
"arguments": {
|
||||
"x-message-ttl": 86400000
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "notification_events",
|
||||
"vhost": "/",
|
||||
"durable": true,
|
||||
"auto_delete": false,
|
||||
"arguments": {
|
||||
"x-message-ttl": 86400000
|
||||
}
|
||||
}
|
||||
],
|
||||
"bindings": [
|
||||
{
|
||||
"source": "bakery_events",
|
||||
"vhost": "/",
|
||||
"destination": "training_events",
|
||||
"destination_type": "queue",
|
||||
"routing_key": "training.*",
|
||||
"arguments": {}
|
||||
},
|
||||
{
|
||||
"source": "bakery_events",
|
||||
"vhost": "/",
|
||||
"destination": "forecasting_events",
|
||||
"destination_type": "queue",
|
||||
"routing_key": "forecasting.*",
|
||||
"arguments": {}
|
||||
},
|
||||
{
|
||||
"source": "bakery_events",
|
||||
"vhost": "/",
|
||||
"destination": "notification_events",
|
||||
"destination_type": "queue",
|
||||
"routing_key": "notification.*",
|
||||
"arguments": {}
|
||||
}
|
||||
]
|
||||
}
|
||||
26
infrastructure/rabbitmq/rabbitmq.conf
Normal file
26
infrastructure/rabbitmq/rabbitmq.conf
Normal file
@@ -0,0 +1,26 @@
|
||||
# infrastructure/rabbitmq/rabbitmq.conf
|
||||
# RabbitMQ configuration file
|
||||
|
||||
# Network settings
|
||||
listeners.tcp.default = 5672
|
||||
management.tcp.port = 15672
|
||||
|
||||
# Memory and disk thresholds
|
||||
vm_memory_high_watermark.relative = 0.6
|
||||
disk_free_limit.relative = 2.0
|
||||
|
||||
# Default user (will be overridden by environment variables)
|
||||
default_user = bakery
|
||||
default_pass = forecast123
|
||||
default_vhost = /
|
||||
|
||||
# Management plugin
|
||||
management.load_definitions = /etc/rabbitmq/definitions.json
|
||||
|
||||
# Logging
|
||||
log.console = true
|
||||
log.console.level = info
|
||||
log.file = false
|
||||
|
||||
# Queue settings
|
||||
queue_master_locator = min-masters
|
||||
51
infrastructure/redis/redis.conf
Normal file
51
infrastructure/redis/redis.conf
Normal file
@@ -0,0 +1,51 @@
|
||||
# infrastructure/redis/redis.conf
|
||||
# Redis configuration file
|
||||
|
||||
# Network settings
|
||||
bind 0.0.0.0
|
||||
port 6379
|
||||
timeout 300
|
||||
tcp-keepalive 300
|
||||
|
||||
# General settings
|
||||
daemonize no
|
||||
supervised no
|
||||
pidfile /var/run/redis_6379.pid
|
||||
loglevel notice
|
||||
logfile ""
|
||||
|
||||
# Persistence settings
|
||||
save 900 1
|
||||
save 300 10
|
||||
save 60 10000
|
||||
stop-writes-on-bgsave-error yes
|
||||
rdbcompression yes
|
||||
rdbchecksum yes
|
||||
dbfilename dump.rdb
|
||||
dir ./
|
||||
|
||||
# Append only file settings
|
||||
appendonly yes
|
||||
appendfilename "appendonly.aof"
|
||||
appendfsync everysec
|
||||
no-appendfsync-on-rewrite no
|
||||
auto-aof-rewrite-percentage 100
|
||||
auto-aof-rewrite-min-size 64mb
|
||||
aof-load-truncated yes
|
||||
|
||||
# Memory management
|
||||
maxmemory 512mb
|
||||
maxmemory-policy allkeys-lru
|
||||
maxmemory-samples 5
|
||||
|
||||
# Security
|
||||
requirepass redis_pass123
|
||||
|
||||
# Slow log
|
||||
slowlog-log-slower-than 10000
|
||||
slowlog-max-len 128
|
||||
|
||||
# Client output buffer limits
|
||||
client-output-buffer-limit normal 0 0 0
|
||||
client-output-buffer-limit replica 256mb 64mb 60
|
||||
client-output-buffer-limit pubsub 32mb 8mb 60
|
||||
Reference in New Issue
Block a user