Initial commit - production deployment

This commit is contained in:
2026-01-21 17:17:16 +01:00
commit c23d00dd92
2289 changed files with 638440 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
"""WebSocket support for training service"""
from app.websocket.manager import websocket_manager, WebSocketConnectionManager
from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers
# Public API of this package: the connection manager (singleton instance and
# class) plus the RabbitMQ consumer setup/cleanup helpers imported above.
__all__ = [
'websocket_manager',
'WebSocketConnectionManager',
'setup_websocket_event_consumer',
'cleanup_websocket_consumers'
]

View File

@@ -0,0 +1,148 @@
"""
RabbitMQ Event Consumer for WebSocket Broadcasting
Listens to training events from RabbitMQ and broadcasts them to WebSocket clients
"""
import asyncio
import json
from typing import Dict, Set
import structlog
from app.websocket.manager import websocket_manager
from app.services.training_events import training_publisher
# Module-level structlog logger used by the functions in this module.
logger = structlog.get_logger()
# Track active consumers
# NOTE(review): nothing in this module adds tasks to this set; it is only
# read and cleared by cleanup_websocket_consumers(). Confirm whether any
# other module registers consumer tasks here.
_active_consumers: Set[asyncio.Task] = set()
async def handle_training_event(message) -> None:
    """
    Bridge one RabbitMQ training event to the WebSocket clients of its job.

    The message body is decoded as JSON, translated into a WebSocket payload
    and handed to ``websocket_manager.broadcast``. The message is acknowledged
    in every handled outcome (success, missing ``job_id``, or processing
    error) so that a bad event cannot cause an infinite redelivery loop;
    failures are logged instead of being retried.

    Args:
        message: Incoming broker message exposing ``body`` (bytes) and an
            awaitable ``ack()``.
    """
    try:
        # Parse message
        data = json.loads(message.body.decode())
        event_type = data.get('event_type', 'unknown')
        # ``or {}`` also covers an explicit ``"data": null`` payload, which
        # would otherwise fail on the ``.get('job_id')`` below.
        event_data = data.get('data') or {}
        job_id = event_data.get('job_id')

        if not job_id:
            logger.warning("Received event without job_id, skipping", event_type=event_type)
            await message.ack()
            return

        logger.info("Received training event from RabbitMQ",
                    job_id=job_id,
                    event_type=event_type,
                    progress=event_data.get('progress'))

        # Map RabbitMQ event types to WebSocket message types
        ws_message_type = _map_event_type(event_type)

        # Create WebSocket message
        ws_message = {
            "type": ws_message_type,
            "job_id": job_id,
            "timestamp": data.get('timestamp'),
            "data": event_data
        }

        # Broadcast to all WebSocket clients for this job
        sent_count = await websocket_manager.broadcast(job_id, ws_message)
        logger.info("Broadcasted event to WebSocket clients",
                    job_id=job_id,
                    event_type=event_type,
                    ws_message_type=ws_message_type,
                    clients_notified=sent_count)

        # Always acknowledge the message to avoid infinite redelivery loops.
        # Progress events are ephemeral and final events must not be replayed.
        await message.ack()
    except Exception as e:
        logger.error("Error handling training event",
                     error=str(e),
                     exc_info=True)
        # Always acknowledge even on error to avoid infinite redelivery loops.
        # The event is logged above so the failure can still be debugged.
        try:
            await message.ack()
        except Exception as ack_error:
            # Message already gone or connection closed. ``Exception`` (not a
            # bare ``except``) so task cancellation is never swallowed here.
            logger.debug("Could not acknowledge training event after error",
                         error=str(ack_error))
def _map_event_type(rabbitmq_event_type: str) -> str:
"""Map RabbitMQ event types to WebSocket message types"""
mapping = {
"training.started": "started",
"training.progress": "progress",
"training.step.completed": "step_completed",
"training.product.completed": "product_completed",
"training.completed": "completed",
"training.failed": "failed",
}
return mapping.get(rabbitmq_event_type, "unknown")
async def setup_websocket_event_consumer() -> bool:
    """
    Start the global RabbitMQ subscription that feeds WebSocket broadcasting.

    Connects ``training_publisher`` when it is not connected yet, then asks it
    to consume from the fixed queue ``training_websocket_broadcast`` on the
    ``training.events`` exchange with routing key ``training.#``, delivering
    every message to ``handle_training_event``.

    Returns:
        True when the consumer was registered; False on any failure. Errors
        are logged and never raised to the caller.
    """
    # Fixed (shared) queue name used for WebSocket broadcasting.
    broadcast_queue = "training_websocket_broadcast"

    try:
        # The consumer rides on the publisher's connection, so make sure it is up.
        if not training_publisher.connected:
            logger.info("Connecting training publisher for WebSocket event consumer")
            connected = await training_publisher.connect()
            if not connected:
                logger.error("Failed to connect training publisher")
                return False

        logger.info("Setting up WebSocket event consumer", queue_name=broadcast_queue)

        # "training.#" matches every training event, at any routing-key depth.
        subscribed = await training_publisher.consume_events(
            exchange_name="training.events",
            queue_name=broadcast_queue,
            routing_key="training.#",
            callback=handle_training_event
        )

        if not subscribed:
            logger.error("Failed to set up WebSocket event consumer")
            return False

        logger.info("WebSocket event consumer set up successfully")
        return True
    except Exception as exc:
        logger.error("Error setting up WebSocket event consumer",
                     error=str(exc),
                     exc_info=True)
        return False
async def cleanup_websocket_consumers() -> None:
    """
    Cancel every tracked consumer task and wait for all of them to finish.

    All unfinished tasks are cancelled first and then awaited together, so a
    consumer that raises while shutting down cannot prevent the remaining
    consumers from being cancelled. Such errors are logged, not raised.
    """
    logger.info("Cleaning up WebSocket event consumers")

    # Work on a snapshot: iterating the live set across ``await`` points would
    # raise RuntimeError if the set is modified while we are suspended.
    tracked = list(_active_consumers)
    pending = [task for task in tracked if not task.done()]

    for task in pending:
        task.cancel()

    if pending:
        results = await asyncio.gather(*pending, return_exceptions=True)
        for result in results:
            # Cancellation is the expected outcome; anything else is reported.
            if isinstance(result, Exception) and not isinstance(result, asyncio.CancelledError):
                logger.warning("WebSocket event consumer raised during cleanup",
                               error=str(result))

    # Only forget the tasks handled above; tasks registered while we were
    # waiting are left in place rather than silently dropped.
    _active_consumers.difference_update(tracked)
    logger.info("WebSocket event consumers cleaned up")

View File

@@ -0,0 +1,300 @@
"""
WebSocket Connection Manager for Training Service
Manages WebSocket connections and broadcasts RabbitMQ events to connected clients
HORIZONTAL SCALING:
- Uses Redis pub/sub for cross-pod WebSocket broadcasting
- Each pod subscribes to a Redis channel and broadcasts to its local connections
- Events published to Redis are received by all pods, ensuring clients on any
pod receive events from training jobs running on any other pod
"""
import asyncio
import json
import os
from typing import Dict, Optional
from fastapi import WebSocket
import structlog
# Module-level structlog logger shared by the connection manager below.
logger = structlog.get_logger()
# Redis pub/sub channel for WebSocket events.
# WebSocketConnectionManager subscribes to this channel in initialize_redis()
# and publishes to it in broadcast().
REDIS_WEBSOCKET_CHANNEL = "training:websocket:events"
class WebSocketConnectionManager:
    """
    WebSocket connection manager with Redis pub/sub for horizontal scaling.

    In a multi-pod deployment:
    1. Events are published to Redis pub/sub (not just broadcast locally)
    2. Each pod subscribes to Redis and broadcasts to its local WebSocket connections
    3. This ensures clients connected to any pod receive events from any pod

    Flow:
    - RabbitMQ event -> Pod A receives -> Pod A publishes to Redis
    - Redis pub/sub -> All pods receive -> Each pod broadcasts to local WebSockets

    Without Redis (``initialize_redis`` never called, or it failed) the
    manager degrades to local-only broadcasting.
    """

    def __init__(self):
        # Structure: {job_id: {websocket_id: WebSocket}}
        self._connections: Dict[str, Dict[int, WebSocket]] = {}
        # Guards structural changes to ``_connections``.
        self._lock = asyncio.Lock()
        # Latest event per job, replayed as "initial_state" to new connections.
        # NOTE(review): entries are never evicted by this class, so this grows
        # with the number of distinct job_ids seen by the process.
        self._latest_events: Dict[str, dict] = {}
        # Redis client / pub-sub handle; both stay None unless
        # initialize_redis() succeeds.
        self._redis: Optional[object] = None
        self._pubsub: Optional[object] = None
        self._subscriber_task: Optional[asyncio.Task] = None
        self._running = False
        # Identifies this process in cross-pod log lines and Redis payloads.
        self._instance_id = f"{os.environ.get('HOSTNAME', 'unknown')}:{os.getpid()}"

    async def initialize_redis(self, redis_url: str) -> bool:
        """
        Initialize Redis connection for cross-pod pub/sub.

        On failure nothing is left half-configured: the partially created
        client is closed and the manager stays in local-only mode.

        Args:
            redis_url: Redis connection URL

        Returns:
            True if successful (or already initialized), False otherwise
        """
        # A second call must not start a second subscriber task.
        if self._running and self._subscriber_task and not self._subscriber_task.done():
            logger.debug("Redis pub/sub already initialized",
                         instance_id=self._instance_id)
            return True

        redis_client = None
        pubsub = None
        try:
            import redis.asyncio as redis_async

            redis_client = redis_async.from_url(redis_url, decode_responses=True)
            await redis_client.ping()

            # Create pub/sub subscriber
            pubsub = redis_client.pubsub()
            await pubsub.subscribe(REDIS_WEBSOCKET_CHANNEL)
        except Exception as e:
            logger.error("Failed to initialize Redis pub/sub",
                         error=str(e),
                         instance_id=self._instance_id)
            # Do not keep a client around without a running subscriber:
            # broadcast() would publish to Redis and local clients would
            # never receive the event.
            await self._close_quietly(pubsub, redis_client)
            return False

        # Publish the fully working handles only now.
        self._redis = redis_client
        self._pubsub = pubsub
        self._running = True
        self._subscriber_task = asyncio.create_task(self._redis_subscriber_loop())

        logger.info("Redis pub/sub initialized for WebSocket broadcasting",
                    instance_id=self._instance_id,
                    channel=REDIS_WEBSOCKET_CHANNEL)
        return True

    async def shutdown(self):
        """
        Shutdown Redis pub/sub connection.

        Stops the subscriber task, unsubscribes and closes the Redis handles,
        and resets the manager to local-only mode so later broadcast() calls
        do not try to use a closed client.
        """
        # From here on broadcast() falls back to local delivery.
        self._running = False

        subscriber_task, self._subscriber_task = self._subscriber_task, None
        if subscriber_task:
            subscriber_task.cancel()
            try:
                await subscriber_task
            except asyncio.CancelledError:
                pass

        pubsub, self._pubsub = self._pubsub, None
        redis_client, self._redis = self._redis, None

        if pubsub:
            try:
                await pubsub.unsubscribe(REDIS_WEBSOCKET_CHANNEL)
            except Exception as e:
                # Keep going: the handles below still have to be closed.
                logger.warning("Failed to unsubscribe from Redis channel",
                               error=str(e),
                               instance_id=self._instance_id)
        await self._close_quietly(pubsub, redis_client)

        logger.info("Redis pub/sub shutdown complete",
                    instance_id=self._instance_id)

    async def _close_quietly(self, *resources) -> None:
        """Close each non-None Redis handle, logging errors instead of raising."""
        for resource in resources:
            if resource is None:
                continue
            try:
                await resource.close()
            except Exception as e:
                logger.warning("Error closing Redis resource",
                               error=str(e),
                               instance_id=self._instance_id)

    async def _redis_subscriber_loop(self):
        """Background task to receive Redis pub/sub messages and broadcast locally"""
        try:
            while self._running:
                try:
                    # The 1s timeout lets the loop re-check ``_running`` regularly.
                    message = await self._pubsub.get_message(
                        ignore_subscribe_messages=True,
                        timeout=1.0
                    )
                    if message and message['type'] == 'message':
                        await self._handle_redis_message(message['data'])
                except asyncio.CancelledError:
                    break
                except Exception as e:
                    logger.error("Error in Redis subscriber loop",
                                 error=str(e),
                                 instance_id=self._instance_id)
                    await asyncio.sleep(1)  # Backoff on error
        except asyncio.CancelledError:
            # Cancelled during the backoff sleep.
            pass
        logger.info("Redis subscriber loop stopped",
                    instance_id=self._instance_id)

    async def _handle_redis_message(self, data: str):
        """
        Handle a message received from Redis pub/sub.

        Args:
            data: JSON text with ``job_id``, ``message`` and
                ``source_instance`` keys, as produced by broadcast().
        """
        try:
            payload = json.loads(data)
            job_id = payload.get('job_id')
            message = payload.get('message')
            source_instance = payload.get('source_instance')

            if not job_id or not message:
                return

            # Log cross-pod message
            if source_instance != self._instance_id:
                logger.debug("Received cross-pod WebSocket event",
                             job_id=job_id,
                             source_instance=source_instance,
                             local_instance=self._instance_id)

            # Record the event on every pod, not only on the one that consumed
            # it from RabbitMQ, so clients connecting here get current state.
            self._remember_latest_event(job_id, message)

            # Broadcast to local WebSocket connections
            await self._broadcast_local(job_id, message)
        except json.JSONDecodeError as e:
            logger.warning("Invalid JSON in Redis message", error=str(e))
        except Exception as e:
            logger.error("Error handling Redis message", error=str(e))

    def _remember_latest_event(self, job_id: str, message) -> None:
        """Store ``message`` as the job's latest event unless it is an initial_state replay."""
        if isinstance(message, dict) and message.get('type') != 'initial_state':
            self._latest_events[job_id] = message

    async def connect(self, job_id: str, websocket: WebSocket) -> None:
        """
        Register a new WebSocket connection for a job.

        Accepts the socket, registers it under ``job_id`` and, when an event
        is already known for the job, sends it as an ``initial_state`` message.
        """
        await websocket.accept()
        ws_id = id(websocket)

        async with self._lock:
            job_connections = self._connections.setdefault(job_id, {})
            job_connections[ws_id] = websocket
            total_connections = len(job_connections)
            latest_event = self._latest_events.get(job_id)

        # Send initial state if available. Done outside the lock so one slow
        # client cannot stall other connects and disconnects.
        if latest_event is not None:
            try:
                await websocket.send_json({
                    "type": "initial_state",
                    "job_id": job_id,
                    "data": latest_event
                })
            except Exception as e:
                logger.warning("Failed to send initial state to new connection", error=str(e))

        logger.info("WebSocket connected",
                    job_id=job_id,
                    websocket_id=ws_id,
                    total_connections=total_connections,
                    instance_id=self._instance_id)

    async def disconnect(self, job_id: str, websocket: WebSocket) -> None:
        """Remove a WebSocket connection; unknown jobs are ignored."""
        ws_id = id(websocket)
        async with self._lock:
            job_connections = self._connections.get(job_id)
            if job_connections is None:
                return

            job_connections.pop(ws_id, None)
            # Clean up empty job connections
            if not job_connections:
                del self._connections[job_id]
            remaining_connections = len(self._connections.get(job_id, {}))

        logger.info("WebSocket disconnected",
                    job_id=job_id,
                    websocket_id=ws_id,
                    remaining_connections=remaining_connections,
                    instance_id=self._instance_id)

    async def broadcast(self, job_id: str, message: dict) -> int:
        """
        Broadcast a message to all connections for a specific job across ALL pods.

        If Redis is enabled, publishes to Redis pub/sub, which then reaches
        every pod (including this one) through its subscriber loop.
        Otherwise, or when publishing fails, falls back to local-only broadcast.

        Returns:
            The number of successful local sends. This is 0 when the message
            was handed to Redis, because delivery then happens asynchronously
            in the subscriber loop.
        """
        # Store the latest event for this job to provide initial state to new connections
        self._remember_latest_event(job_id, message)

        # Publish only while the subscriber is running; otherwise this pod's
        # own clients would never see the event.
        if self.is_redis_enabled():
            try:
                payload = json.dumps({
                    'job_id': job_id,
                    'message': message,
                    'source_instance': self._instance_id
                })
                await self._redis.publish(REDIS_WEBSOCKET_CHANNEL, payload)
                logger.debug("Published WebSocket event to Redis",
                             job_id=job_id,
                             message_type=message.get('type'),
                             instance_id=self._instance_id)
                # The actual broadcast happens via the subscriber loop.
                return 0
            except Exception as e:
                logger.warning("Failed to publish to Redis, falling back to local broadcast",
                               error=str(e),
                               job_id=job_id)
                # Fall through to local broadcast

        # Local-only broadcast (when Redis is not available)
        return await self._broadcast_local(job_id, message)

    async def _broadcast_local(self, job_id: str, message: dict) -> int:
        """
        Broadcast a message to local WebSocket connections only.

        This is called either directly (no Redis) or from the Redis subscriber.
        Connections whose send fails are removed from the registry.

        Returns:
            The number of connections the message was sent to successfully.
        """
        job_connections = self._connections.get(job_id)
        if not job_connections:
            logger.debug("No active local connections for job",
                         job_id=job_id,
                         instance_id=self._instance_id)
            return 0

        successful_sends = 0
        failed_websockets = []
        # Iterate over a copy: connects/disconnects may run while we await.
        for websocket in list(job_connections.values()):
            try:
                await websocket.send_json(message)
                successful_sends += 1
            except Exception as e:
                logger.warning("Failed to send message to WebSocket",
                               job_id=job_id,
                               error=str(e))
                failed_websockets.append(websocket)

        # Clean up failed connections
        if failed_websockets:
            async with self._lock:
                # Look the job up again: its last client may have disconnected
                # (removing the job entry) while the sends were in flight.
                current = self._connections.get(job_id)
                if current is not None:
                    for ws in failed_websockets:
                        current.pop(id(ws), None)
                    if not current:
                        del self._connections[job_id]

        if successful_sends > 0:
            message_type = message.get('type') if isinstance(message, dict) else None
            logger.info("Broadcasted message to local WebSocket clients",
                        job_id=job_id,
                        message_type=message_type,
                        successful_sends=successful_sends,
                        failed_sends=len(failed_websockets),
                        instance_id=self._instance_id)
        return successful_sends

    def get_connection_count(self, job_id: str) -> int:
        """Get the number of active local connections for a job"""
        return len(self._connections.get(job_id, {}))

    def get_total_connection_count(self) -> int:
        """Get total number of active local connections across all jobs"""
        return sum(len(conns) for conns in self._connections.values())

    def is_redis_enabled(self) -> bool:
        """Check if Redis pub/sub is enabled"""
        return self._redis is not None and self._running
# Global singleton instance
# Created at import time with Redis disabled; cross-pod broadcasting only
# becomes active once initialize_redis() has been awaited successfully.
websocket_manager = WebSocketConnectionManager()