Improve monitoring

This commit is contained in:
Urtzi Alfaro
2026-01-09 06:57:18 +01:00
parent e8fda39e50
commit 4af860c010
16 changed files with 333 additions and 635 deletions

View File

@@ -360,18 +360,6 @@ class DemoCleanupService:
logger.info("Demo session cleanup completed", stats=stats)
# Update Prometheus metrics
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
demo_session_cleanup_duration_seconds.labels(tier="all").observe(duration_ms / 1000)
# Update deleted sessions metrics by tier (we need to determine tiers from sessions)
for session in all_sessions_to_cleanup:
demo_sessions_deleted_total.labels(
tier=session.demo_account_type,
status="success"
).inc()
demo_sessions_active.labels(tier=session.demo_account_type).dec()
return stats
async def cleanup_old_destroyed_sessions(self, days: int = 7) -> int:

View File

@@ -284,9 +284,7 @@ class CloneOrchestrator:
)
start_time = datetime.now(timezone.utc)
# Update active sessions metric
demo_sessions_active.labels(tier=demo_account_type).inc()
# Filter services if specified
services_to_clone = self.services
@@ -383,29 +381,6 @@ class CloneOrchestrator:
services_status=all_services,
demo_account_type=demo_account_type
)
# Update Prometheus metrics
demo_session_creation_duration_seconds.labels(tier=demo_account_type).observe(duration_ms / 1000)
demo_sessions_created_total.labels(tier=demo_account_type, status=overall_status).inc()
# Update alert and insight metrics if available
if result.get("alert_generation"):
alert_gen = result["alert_generation"]
for alert_type, alerts in alert_gen.items():
if isinstance(alerts, dict) and alerts.get("alerts_generated"):
demo_alerts_generated_total.labels(
tier=demo_account_type,
alert_type=alert_type
).inc(alerts["alerts_generated"])
if result.get("ai_insights_generation"):
insights_gen = result["ai_insights_generation"]
for insight_type, insights in insights_gen.items():
if isinstance(insights, dict) and insights.get("insights_posted"):
demo_ai_insights_generated_total.labels(
tier=demo_account_type,
insight_type=insight_type
).inc(insights["insights_posted"])
return result
@@ -549,20 +524,6 @@ class CloneOrchestrator:
duration_ms=duration_ms
)
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="success"
).inc()
demo_cross_service_call_duration_seconds.labels(
source_service="demo-session",
target_service=service.name
).observe(duration_seconds)
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
if response.status_code == 200:
result = response.json()
logger.info(
@@ -582,17 +543,6 @@ class CloneOrchestrator:
response_text=response.text
)
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="http_error"
).inc()
return {
"service": service.name,
"status": "failed",
@@ -614,22 +564,6 @@ class CloneOrchestrator:
url=service.url
)
# Update error metrics
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="timeout"
).inc()
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
return {
"service": service.name,
"status": "failed",
@@ -650,22 +584,6 @@ class CloneOrchestrator:
exc_info=True
)
# Update error metrics
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="network_error"
).inc()
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
return {
"service": service.name,
"status": "failed",
@@ -686,22 +604,6 @@ class CloneOrchestrator:
exc_info=True
)
# Update error metrics
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="exception"
).inc()
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
return {
"service": service.name,
"status": "failed",