REFACTOR ALL APIs fix 1

This commit is contained in:
Urtzi Alfaro
2025-10-07 07:15:07 +02:00
parent 38fb98bc27
commit 7c72f83c51
47 changed files with 1821 additions and 270 deletions

View File

@@ -21,7 +21,7 @@ from app.middleware.logging import LoggingMiddleware
from app.middleware.rate_limit import RateLimitMiddleware
from app.middleware.subscription import SubscriptionMiddleware
from app.middleware.demo_middleware import DemoMiddleware
from app.routes import auth, tenant, notification, nominatim, user, subscription, demo
from app.routes import auth, tenant, notification, nominatim, user, subscription, demo, pos
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
@@ -71,6 +71,7 @@ app.include_router(tenant.router, prefix="/api/v1/tenants", tags=["tenants"])
app.include_router(subscription.router, prefix="/api/v1", tags=["subscriptions"])
app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"])
app.include_router(nominatim.router, prefix="/api/v1/nominatim", tags=["location"])
app.include_router(pos.router, prefix="/api/v1/pos", tags=["pos"])
app.include_router(demo.router, prefix="/api/v1", tags=["demo"])
@@ -251,203 +252,106 @@ async def events_stream(request: Request, tenant_id: str):
# WEBSOCKET ROUTING FOR TRAINING SERVICE
# ================================================================
@app.websocket("/api/v1/ws/tenants/{tenant_id}/training/jobs/{job_id}/live")
@app.websocket("/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live")
async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
"""WebSocket proxy that forwards connections directly to training service with enhanced token validation"""
await websocket.accept()
# Get token from query params
"""
WebSocket proxy that forwards connections directly to training service.
Acts as a pure proxy - does NOT handle websocket logic, just forwards to training service.
All auth, message handling, and business logic is in the training service.
"""
# Get token from query params (required for training service authentication)
token = websocket.query_params.get("token")
if not token:
logger.warning(f"WebSocket connection rejected - missing token for job {job_id}")
logger.warning(f"WebSocket proxy rejected - missing token for job {job_id}")
await websocket.accept()
await websocket.close(code=1008, reason="Authentication token required")
return
# Validate token using auth middleware
from app.middleware.auth import jwt_handler
try:
payload = jwt_handler.verify_token(token)
if not payload:
logger.warning(f"WebSocket connection rejected - invalid token for job {job_id}")
await websocket.close(code=1008, reason="Invalid authentication token")
return
# Accept the connection immediately
await websocket.accept()
# Check token expiration
import time
if payload.get('exp', 0) < time.time():
logger.warning(f"WebSocket connection rejected - expired token for job {job_id}")
await websocket.close(code=1008, reason="Token expired")
return
logger.info(f"Gateway proxying WebSocket to training service for job {job_id}, tenant {tenant_id}")
logger.info(f"WebSocket token validated for user {payload.get('email', 'unknown')}")
except Exception as e:
logger.warning(f"WebSocket token validation failed for job {job_id}: {e}")
await websocket.close(code=1008, reason="Token validation failed")
return
logger.info(f"Proxying WebSocket connection to training service for job {job_id}, tenant {tenant_id}")
# Build WebSocket URL to training service
# Build WebSocket URL to training service - forward to the exact same path
training_service_base = settings.TRAINING_SERVICE_URL.rstrip('/')
training_ws_url = training_service_base.replace('http://', 'ws://').replace('https://', 'wss://')
training_ws_url = f"{training_ws_url}/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live?token={token}"
training_ws = None
heartbeat_task = None
try:
# Connect to training service WebSocket with proper timeout configuration
# Connect to training service WebSocket
import websockets
# Configure timeouts to coordinate with frontend (30s heartbeat) and training service
# DISABLE gateway-level ping to avoid dual-ping conflicts - let frontend handle ping/pong
training_ws = await websockets.connect(
training_ws_url,
ping_interval=None, # DISABLED: Let frontend handle ping/pong via message forwarding
ping_timeout=None, # DISABLED: No independent ping mechanism
close_timeout=15, # Reasonable close timeout
max_size=2**20, # 1MB max message size
max_queue=32 # Max queued messages
ping_interval=None, # Let training service handle heartbeat
ping_timeout=None,
close_timeout=10,
open_timeout=30, # Allow time for training service to setup
max_size=2**20,
max_queue=32
)
logger.info(f"Connected to training service WebSocket for job {job_id} with gateway ping DISABLED (frontend handles ping/pong)")
# Track connection state properly due to FastAPI WebSocket state propagation bug
connection_alive = True
last_activity = asyncio.get_event_loop().time()
async def check_connection_health():
"""Monitor connection health based on activity timestamps only - no WebSocket interference"""
nonlocal connection_alive, last_activity
while connection_alive:
try:
await asyncio.sleep(30) # Check every 30 seconds (aligned with frontend heartbeat)
current_time = asyncio.get_event_loop().time()
# Check if we haven't received any activity for too long
# Frontend sends ping every 30s, so 90s = 3 missed pings before considering dead
if current_time - last_activity > 90:
logger.warning(f"No frontend activity for 90s on job {job_id} - connection may be dead")
# Don't forcibly close - let the forwarding loops handle actual connection issues
# This is just monitoring/logging now
else:
logger.debug(f"Connection health OK for job {job_id} - last activity {int(current_time - last_activity)}s ago")
except Exception as e:
logger.error(f"Connection health monitoring error for job {job_id}: {e}")
break
logger.info(f"Gateway connected to training service WebSocket for job {job_id}")
async def forward_to_training():
"""Forward messages from frontend to training service with proper error handling"""
nonlocal connection_alive, last_activity
"""Forward messages from frontend to training service"""
try:
while connection_alive and training_ws and training_ws.open:
try:
# Use longer timeout to avoid conflicts with frontend 30s heartbeat
# Frontend sends ping every 30s, so we need to allow for some latency
data = await asyncio.wait_for(websocket.receive(), timeout=45.0)
last_activity = asyncio.get_event_loop().time()
while training_ws and training_ws.open:
data = await websocket.receive()
# Handle different message types
if data.get("type") == "websocket.receive":
if "text" in data:
message = data["text"]
# Forward text messages to training service
await training_ws.send(message)
logger.debug(f"Forwarded message to training service for job {job_id}: {message[:100]}...")
elif "bytes" in data:
# Forward binary messages if needed
await training_ws.send(data["bytes"])
# Ping/pong frames are automatically handled by Starlette/FastAPI
except asyncio.TimeoutError:
# No message received in 45 seconds, continue loop
# This allows for frontend 30s heartbeat + network latency + processing time
continue
except Exception as e:
logger.error(f"Error receiving from frontend for job {job_id}: {e}")
connection_alive = False
if data.get("type") == "websocket.receive":
if "text" in data:
await training_ws.send(data["text"])
logger.debug(f"Gateway forwarded frontend->training: {data['text'][:100]}")
elif "bytes" in data:
await training_ws.send(data["bytes"])
elif data.get("type") == "websocket.disconnect":
logger.info(f"Frontend disconnected for job {job_id}")
break
except Exception as e:
logger.error(f"Error in forward_to_training for job {job_id}: {e}")
connection_alive = False
logger.error(f"Error forwarding frontend->training for job {job_id}: {e}")
async def forward_to_frontend():
"""Forward messages from training service to frontend with proper error handling"""
nonlocal connection_alive, last_activity
"""Forward messages from training service to frontend"""
try:
while connection_alive and training_ws and training_ws.open:
try:
# Use coordinated timeout - training service expects messages every 60s
# This should be longer than training service timeout to avoid premature closure
message = await asyncio.wait_for(training_ws.recv(), timeout=75.0)
last_activity = asyncio.get_event_loop().time()
# Forward the message to frontend
await websocket.send_text(message)
logger.debug(f"Forwarded message to frontend for job {job_id}: {message[:100]}...")
except asyncio.TimeoutError:
# No message received in 75 seconds, continue loop
# Training service sends heartbeats, so this indicates potential issues
continue
except Exception as e:
logger.error(f"Error receiving from training service for job {job_id}: {e}")
connection_alive = False
break
while training_ws and training_ws.open:
message = await training_ws.recv()
await websocket.send_text(message)
logger.debug(f"Gateway forwarded training->frontend: {message[:100]}")
except Exception as e:
logger.error(f"Error in forward_to_frontend for job {job_id}: {e}")
connection_alive = False
logger.error(f"Error forwarding training->frontend for job {job_id}: {e}")
# Start connection health monitoring
heartbeat_task = asyncio.create_task(check_connection_health())
# Run both forwarding tasks concurrently with proper error handling
try:
await asyncio.gather(
forward_to_training(),
forward_to_frontend(),
return_exceptions=True
)
except Exception as e:
logger.error(f"Error in WebSocket forwarding tasks for job {job_id}: {e}")
finally:
connection_alive = False
# Run both forwarding tasks concurrently
await asyncio.gather(
forward_to_training(),
forward_to_frontend(),
return_exceptions=True
)
except websockets.exceptions.ConnectionClosedError as e:
logger.warning(f"Training service WebSocket connection closed for job {job_id}: {e}")
logger.warning(f"Training service WebSocket closed for job {job_id}: {e}")
except websockets.exceptions.WebSocketException as e:
logger.error(f"WebSocket exception for job {job_id}: {e}")
except Exception as e:
logger.error(f"WebSocket proxy error for job {job_id}: {e}")
finally:
# Cleanup
if heartbeat_task and not heartbeat_task.done():
heartbeat_task.cancel()
try:
await heartbeat_task
except asyncio.CancelledError:
pass
if training_ws and not training_ws.closed:
try:
await training_ws.close()
logger.info(f"Closed training service WebSocket for job {job_id}")
except Exception as e:
logger.warning(f"Error closing training service WebSocket for job {job_id}: {e}")
try:
if not websocket.client_state.name == 'DISCONNECTED':
await websocket.close(code=1000, reason="Proxy connection closed")
await websocket.close(code=1000, reason="Proxy closed")
except Exception as e:
logger.warning(f"Error closing frontend WebSocket for job {job_id}: {e}")
logger.info(f"WebSocket proxy cleanup completed for job {job_id}")
logger.info(f"Gateway WebSocket proxy cleanup completed for job {job_id}")
if __name__ == "__main__":
import uvicorn