demo seed change

Urtzi Alfaro
2025-12-13 23:57:54 +01:00
parent f3688dfb04
commit ff830a3415
299 changed files with 20328 additions and 19485 deletions

File diff suppressed because it is too large

View File

@@ -25,10 +25,17 @@ route_builder = RouteBuilder('demo')
async def _background_cloning_task(session_id: str, session_obj_id: UUID, base_tenant_id: str):
"""Background task for orchestrated cloning - creates its own DB session"""
from app.core.database import db_manager
from app.models import DemoSession
from sqlalchemy import select
from app.models import DemoSession, DemoSessionStatus
from sqlalchemy import select, update
from app.core.redis_wrapper import get_redis
logger.info(
"Starting background cloning task",
session_id=session_id,
session_obj_id=str(session_obj_id),
base_tenant_id=base_tenant_id
)
# Create new database session for background task
async with db_manager.session_factory() as db:
try:
@@ -43,8 +50,30 @@ async def _background_cloning_task(session_id: str, session_obj_id: UUID, base_t
if not session:
logger.error("Session not found for cloning", session_id=session_id)
# Mark session as failed in Redis for frontend polling
try:
client = await redis.get_client()
status_key = f"session:{session_id}:status"
import json
status_data = {
"session_id": session_id,
"status": "failed",
"error": "Session not found in database",
"progress": {},
"total_records_cloned": 0
}
await client.setex(status_key, 7200, json.dumps(status_data))
except Exception as redis_error:
logger.error("Failed to update Redis status for missing session", error=str(redis_error))
return
logger.info(
"Found session for cloning",
session_id=session_id,
current_status=session.status.value,
demo_account_type=session.demo_account_type
)
# Create session manager with new DB session
session_manager = DemoSessionManager(db, redis)
await session_manager.trigger_orchestrated_cloning(session, base_tenant_id)
@@ -58,25 +87,40 @@ async def _background_cloning_task(session_id: str, session_obj_id: UUID, base_t
)
# Attempt to update session status to failed if possible
try:
from app.core.database import db_manager
from app.models import DemoSession
from sqlalchemy import select, update
# Try to update the session directly in DB to mark it as failed
async with db_manager.session_factory() as update_db:
from app.models import DemoSessionStatus
update_result = await update_db.execute(
update(DemoSession)
.where(DemoSession.id == session_obj_id)
.values(status=DemoSessionStatus.FAILED, cloning_completed_at=datetime.now(timezone.utc))
)
await update_db.commit()
logger.info("Successfully updated session status to FAILED in database")
except Exception as update_error:
logger.error(
"Failed to update session status to FAILED after background task error",
session_id=session_id,
error=str(update_error)
)
# Also update Redis status for frontend polling
try:
client = await redis.get_client()
status_key = f"session:{session_id}:status"
import json
status_data = {
"session_id": session_id,
"status": "failed",
"error": str(e),
"progress": {},
"total_records_cloned": 0,
"cloning_completed_at": datetime.now(timezone.utc).isoformat()
}
await client.setex(status_key, 7200, json.dumps(status_data))
logger.info("Successfully updated Redis status to FAILED")
except Exception as redis_error:
logger.error("Failed to update Redis status after background task error", error=str(redis_error))
def _handle_task_result(task, session_id: str):
@@ -91,6 +135,36 @@ def _handle_task_result(task, session_id: str):
error=str(e),
exc_info=True
)
# Try to update Redis status to reflect the failure
try:
from app.core.redis_wrapper import get_redis
import json
async def update_redis_status():
redis = await get_redis()
client = await redis.get_client()
status_key = f"session:{session_id}:status"
status_data = {
"session_id": session_id,
"status": "failed",
"error": f"Task exception: {str(e)}",
"progress": {},
"total_records_cloned": 0,
"cloning_completed_at": datetime.now(timezone.utc).isoformat()
}
await client.setex(status_key, 7200, json.dumps(status_data))
# Schedule on the running loop; asyncio.run() would raise here because
# the done-callback already executes inside the event loop
import asyncio
asyncio.get_running_loop().create_task(update_redis_status())
except Exception as redis_error:
logger.error(
"Failed to update Redis status in task result handler",
session_id=session_id,
error=str(redis_error)
)
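For context, _handle_task_result is designed to run as a done-callback on the background cloning task; a minimal wiring sketch (the scheduling function is an assumption, not part of this diff):

import asyncio
from uuid import UUID

def schedule_cloning(session_id: str, session_obj_id: UUID, base_tenant_id: str) -> asyncio.Task:
    # Fire-and-forget: the callback surfaces exceptions the task would
    # otherwise swallow silently
    task = asyncio.create_task(
        _background_cloning_task(session_id, session_obj_id, base_tenant_id)
    )
    task.add_done_callback(lambda t: _handle_task_result(t, session_id))
    return task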
@router.post(
@@ -209,6 +283,123 @@ async def get_session_status(
return status
@router.get(
route_builder.build_resource_detail_route("sessions", "session_id", include_tenant_prefix=False) + "/errors",
response_model=dict
)
async def get_session_errors(
session_id: str = Path(...),
db: AsyncSession = Depends(get_db),
redis: DemoRedisWrapper = Depends(get_redis)
):
"""
Get detailed error information for a failed demo session
Returns comprehensive error details including:
- Failed services and their specific errors
- Network connectivity issues
- Timeout problems
- Service-specific error messages
"""
try:
# Try to get the session first
session_manager = DemoSessionManager(db, redis)
session = await session_manager.get_session(session_id)
if not session:
raise HTTPException(status_code=404, detail="Session not found")
# Check if session has failed status
if session.status != DemoSessionStatus.FAILED:
return {
"session_id": session_id,
"status": session.status.value,
"has_errors": False,
"message": "Session has not failed - no error details available"
}
# Get detailed error information from cloning progress
error_details = []
failed_services = []
if session.cloning_progress:
for service_name, service_data in session.cloning_progress.items():
if isinstance(service_data, dict) and service_data.get("status") == "failed":
failed_services.append(service_name)
error_details.append({
"service": service_name,
"error": service_data.get("error", "Unknown error"),
"response_status": service_data.get("response_status"),
"response_text": service_data.get("response_text", ""),
"duration_ms": service_data.get("duration_ms", 0)
})
# Check Redis for additional error information
client = await redis.get_client()
error_key = f"session:{session_id}:errors"
redis_errors = await client.get(error_key)
if redis_errors:
import json
try:
additional_errors = json.loads(redis_errors)
if isinstance(additional_errors, list):
error_details.extend(additional_errors)
elif isinstance(additional_errors, dict):
error_details.append(additional_errors)
except json.JSONDecodeError:
logger.warning("Failed to parse Redis error data", session_id=session_id)
# Create comprehensive error report
error_report = {
"session_id": session_id,
"status": session.status.value,
"has_errors": True,
"failed_services": failed_services,
"error_count": len(error_details),
"errors": error_details,
"cloning_started_at": session.cloning_started_at.isoformat() if session.cloning_started_at else None,
"cloning_completed_at": session.cloning_completed_at.isoformat() if session.cloning_completed_at else None,
"total_records_cloned": session.total_records_cloned,
"demo_account_type": session.demo_account_type
}
# Add troubleshooting suggestions
suggestions = []
if "tenant" in failed_services:
suggestions.append("Check if tenant service is running and accessible")
suggestions.append("Verify base tenant ID configuration")
if "auth" in failed_services:
suggestions.append("Check if auth service is running and accessible")
suggestions.append("Verify seed data files for auth service")
if any(svc in failed_services for svc in ["inventory", "recipes", "suppliers", "production"]):
suggestions.append("Check if the specific service is running and accessible")
suggestions.append("Verify seed data files exist and are valid")
if any("timeout" in error.get("error", "").lower() for error in error_details):
suggestions.append("Check service response times and consider increasing timeouts")
suggestions.append("Verify network connectivity between services")
if any("network" in error.get("error", "").lower() for error in error_details):
suggestions.append("Check network connectivity between demo-session and other services")
suggestions.append("Verify DNS resolution and service discovery")
if suggestions:
error_report["troubleshooting_suggestions"] = suggestions
return error_report
except Exception as e:
logger.error(
"Failed to retrieve session errors",
session_id=session_id,
error=str(e),
exc_info=True
)
raise HTTPException(
status_code=500,
detail=f"Failed to retrieve error details: {str(e)}"
)
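A frontend or test client could poll this endpoint as in the sketch below (the base URL and literal path are illustrative; the real path is produced by route_builder):

import httpx

async def fetch_session_errors(base_url: str, session_id: str) -> dict:
    # Returns the error report, including troubleshooting_suggestions if any
    async with httpx.AsyncClient(timeout=10.0) as client:
        resp = await client.get(f"{base_url}/sessions/{session_id}/errors")
        resp.raise_for_status()
        return resp.json()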
@router.post(
route_builder.build_resource_detail_route("sessions", "session_id", include_tenant_prefix=False) + "/retry",
response_model=dict

View File

@@ -9,7 +9,7 @@ import structlog
from app.core import get_db, settings
from app.core.redis_wrapper import get_redis, DemoRedisWrapper
from app.services.data_cloner import DemoDataCloner
from app.services.cleanup_service import DemoCleanupService
logger = structlog.get_logger()
router = APIRouter()
@@ -41,24 +41,31 @@ async def cleanup_demo_session_internal(
if not all([tenant_id, session_id]):
raise HTTPException(
status_code=400,
detail="Missing required parameters: tenant_id, session_id"
)
logger.info(
"Internal cleanup requested",
"Internal cleanup requested",
tenant_id=tenant_id,
session_id=session_id
)
data_cloner = DemoDataCloner(db, redis)
cleanup_service = DemoCleanupService(db, redis)
# Validate required fields
if not tenant_id or not session_id:
raise ValueError("tenant_id and session_id are required")
# Delete session data for this tenant
await data_cloner.delete_session_data(
str(tenant_id),
session_id
await cleanup_service._delete_tenant_data(
tenant_id=str(tenant_id),
session_id=str(session_id)
)
# Delete Redis data
await redis.delete_session_data(str(session_id))
logger.info(
"Internal cleanup completed",
tenant_id=tenant_id,
@@ -73,7 +80,7 @@ async def cleanup_demo_session_internal(
except Exception as e:
logger.error(
"Internal cleanup failed",
"Internal cleanup failed",
error=str(e),
tenant_id=cleanup_request.get('tenant_id'),
session_id=cleanup_request.get('session_id'),

View File

@@ -48,23 +48,23 @@ class Settings(BaseServiceSettings):
"email": "demo.enterprise@panaderiacentral.com",
"name": "Panadería Central - Demo Enterprise",
"subdomain": "demo-central",
"base_tenant_id": "c3d4e5f6-a7b8-49c0-d1e2-f3a4b5c6d7e8",
"base_tenant_id": "80000000-0000-4000-a000-000000000001",
"subscription_tier": "enterprise",
"tenant_type": "parent",
"children": [
{
"name": "Madrid Centro",
"base_tenant_id": "d4e5f6a7-b8c9-40d1-e2f3-a4b5c6d7e8f9",
"base_tenant_id": "A0000000-0000-4000-a000-000000000001",
"location": {"city": "Madrid", "zone": "Centro", "latitude": 40.4168, "longitude": -3.7038}
},
{
"name": "Barcelona Gràcia",
"base_tenant_id": "e5f6a7b8-c9d0-41e2-f3a4-b5c6d7e8f9a0",
"base_tenant_id": "B0000000-0000-4000-a000-000000000001",
"location": {"city": "Barcelona", "zone": "Gràcia", "latitude": 41.4036, "longitude": 2.1561}
},
{
"name": "Valencia Ruzafa",
"base_tenant_id": "f6a7b8c9-d0e1-42f3-a4b5-c6d7e8f9a0b1",
"base_tenant_id": "C0000000-0000-4000-a000-000000000001",
"location": {"city": "Valencia", "zone": "Ruzafa", "latitude": 39.4623, "longitude": -0.3645}
}
]
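The replacement seed IDs trade random-looking values for a readable fixed scheme while remaining valid RFC 4122 UUIDs; a quick illustrative check:

import uuid

for tid in ("80000000-0000-4000-a000-000000000001",
            "A0000000-0000-4000-a000-000000000001",
            "B0000000-0000-4000-a000-000000000001",
            "C0000000-0000-4000-a000-000000000001"):
    u = uuid.UUID(tid)       # parses case-insensitively
    assert u.version == 4    # the '4' nibble keeps them version-4 UUIDs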

View File

@@ -0,0 +1,85 @@
"""
Prometheus metrics for demo session service
"""
from prometheus_client import Counter, Histogram, Gauge
# Counters
demo_sessions_created_total = Counter(
'demo_sessions_created_total',
'Total number of demo sessions created',
['tier', 'status']
)
demo_sessions_deleted_total = Counter(
'demo_sessions_deleted_total',
'Total number of demo sessions deleted',
['tier', 'status']
)
demo_cloning_errors_total = Counter(
'demo_cloning_errors_total',
'Total number of cloning errors',
['tier', 'service', 'error_type']
)
# Histograms (for latency percentiles)
demo_session_creation_duration_seconds = Histogram(
'demo_session_creation_duration_seconds',
'Duration of demo session creation',
['tier'],
buckets=[1, 2, 5, 7, 10, 12, 15, 18, 20, 25, 30, 40, 50, 60]
)
demo_service_clone_duration_seconds = Histogram(
'demo_service_clone_duration_seconds',
'Duration of individual service cloning',
['tier', 'service'],
buckets=[0.5, 1, 2, 3, 5, 10, 15, 20, 30, 40, 50]
)
demo_session_cleanup_duration_seconds = Histogram(
'demo_session_cleanup_duration_seconds',
'Duration of demo session cleanup',
['tier'],
buckets=[0.5, 1, 2, 5, 10, 15, 20, 30]
)
# Gauges
demo_sessions_active = Gauge(
'demo_sessions_active',
'Number of currently active demo sessions',
['tier']
)
demo_sessions_pending_cleanup = Gauge(
'demo_sessions_pending_cleanup',
'Number of demo sessions pending cleanup'
)
# Alert generation metrics
demo_alerts_generated_total = Counter(
'demo_alerts_generated_total',
'Total number of alerts generated post-clone',
['tier', 'alert_type']
)
demo_ai_insights_generated_total = Counter(
'demo_ai_insights_generated_total',
'Total number of AI insights generated post-clone',
['tier', 'insight_type']
)
# Cross-service metrics
demo_cross_service_calls_total = Counter(
'demo_cross_service_calls_total',
'Total number of cross-service API calls during cloning',
['source_service', 'target_service', 'status']
)
demo_cross_service_call_duration_seconds = Histogram(
'demo_cross_service_call_duration_seconds',
'Duration of cross-service API calls during cloning',
['source_service', 'target_service'],
buckets=[0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30]
)
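A sketch of how the cloning path might record these metrics (the wrapper is an assumption; only the metric objects above are from this diff):

import time

async def record_clone(tier: str, service: str, clone_coro):
    # Times the awaited clone and counts a typed error if it fails
    start = time.monotonic()
    try:
        return await clone_coro
    except Exception as exc:
        demo_cloning_errors_total.labels(
            tier=tier, service=service, error_type=type(exc).__name__
        ).inc()
        raise
    finally:
        demo_service_clone_duration_seconds.labels(
            tier=tier, service=service
        ).observe(time.monotonic() - start)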

View File

@@ -1,7 +1,9 @@
"""Demo Session Services"""
from .session_manager import DemoSessionManager
from .data_cloner import DemoDataCloner
from .cleanup_service import DemoCleanupService
__all__ = ["DemoSessionManager", "DemoDataCloner", "DemoCleanupService"]
__all__ = [
"DemoSessionManager",
"DemoCleanupService",
]

View File

@@ -4,14 +4,21 @@ Handles automatic cleanup of expired sessions
"""
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, update
from datetime import datetime, timezone
from typing import List
from sqlalchemy import select
from datetime import datetime, timezone, timedelta
import structlog
import httpx
import asyncio
import os
from app.models import DemoSession, DemoSessionStatus
from app.services.data_cloner import DemoDataCloner
from datetime import datetime, timezone, timedelta
from app.core.redis_wrapper import DemoRedisWrapper
from app.monitoring.metrics import (
demo_sessions_deleted_total,
demo_session_cleanup_duration_seconds,
demo_sessions_active
)
logger = structlog.get_logger()
@@ -22,7 +29,199 @@ class DemoCleanupService:
def __init__(self, db: AsyncSession, redis: DemoRedisWrapper):
self.db = db
self.redis = redis
self.data_cloner = DemoDataCloner(db, redis)
from app.core.config import settings
self.internal_api_key = settings.INTERNAL_API_KEY
# Service URLs for cleanup
self.services = [
("tenant", os.getenv("TENANT_SERVICE_URL", "http://tenant-service:8000")),
("auth", os.getenv("AUTH_SERVICE_URL", "http://auth-service:8000")),
("inventory", os.getenv("INVENTORY_SERVICE_URL", "http://inventory-service:8000")),
("recipes", os.getenv("RECIPES_SERVICE_URL", "http://recipes-service:8000")),
("suppliers", os.getenv("SUPPLIERS_SERVICE_URL", "http://suppliers-service:8000")),
("production", os.getenv("PRODUCTION_SERVICE_URL", "http://production-service:8000")),
("procurement", os.getenv("PROCUREMENT_SERVICE_URL", "http://procurement-service:8000")),
("sales", os.getenv("SALES_SERVICE_URL", "http://sales-service:8000")),
("orders", os.getenv("ORDERS_SERVICE_URL", "http://orders-service:8000")),
("forecasting", os.getenv("FORECASTING_SERVICE_URL", "http://forecasting-service:8000")),
("orchestrator", os.getenv("ORCHESTRATOR_SERVICE_URL", "http://orchestrator-service:8000")),
]
async def cleanup_session(self, session: DemoSession) -> dict:
"""
Delete all data for a demo session across all services.
Returns:
{
"success": bool,
"total_deleted": int,
"duration_ms": int,
"details": {service: {records_deleted, duration_ms}},
"errors": []
}
"""
start_time = datetime.now(timezone.utc)
virtual_tenant_id = str(session.virtual_tenant_id)
session_id = session.session_id
logger.info(
"Starting demo session cleanup",
session_id=session_id,
virtual_tenant_id=virtual_tenant_id,
demo_account_type=session.demo_account_type
)
# Delete from all services in parallel
tasks = [
self._delete_from_service(name, url, virtual_tenant_id)
for name, url in self.services
]
service_results = await asyncio.gather(*tasks, return_exceptions=True)
# Aggregate results
total_deleted = 0
details = {}
errors = []
for (service_name, _), result in zip(self.services, service_results):
if isinstance(result, Exception):
errors.append(f"{service_name}: {str(result)}")
details[service_name] = {"status": "error", "error": str(result)}
else:
total_deleted += result.get("records_deleted", {}).get("total", 0)
details[service_name] = result
# Delete from Redis
await self._delete_redis_cache(virtual_tenant_id)
# Delete child tenants if enterprise
if session.demo_account_type == "enterprise":
child_metadata = session.session_metadata.get("children", [])
for child in child_metadata:
child_tenant_id = child["virtual_tenant_id"]
await self._delete_from_all_services(child_tenant_id)
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
success = len(errors) == 0
logger.info(
"Demo session cleanup completed",
session_id=session_id,
virtual_tenant_id=virtual_tenant_id,
success=success,
total_deleted=total_deleted,
duration_ms=duration_ms,
error_count=len(errors)
)
return {
"success": success,
"total_deleted": total_deleted,
"duration_ms": duration_ms,
"details": details,
"errors": errors
}
async def _delete_from_service(
self,
service_name: str,
service_url: str,
virtual_tenant_id: str
) -> dict:
"""Delete all data from a single service"""
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.delete(
f"{service_url}/internal/demo/tenant/{virtual_tenant_id}",
headers={"X-Internal-API-Key": self.internal_api_key}
)
if response.status_code == 200:
return response.json()
elif response.status_code == 404:
# Already deleted or never existed - idempotent
return {
"service": service_name,
"status": "not_found",
"records_deleted": {"total": 0}
}
else:
raise Exception(f"HTTP {response.status_code}: {response.text}")
except Exception as e:
logger.error(
"Failed to delete from service",
service=service_name,
virtual_tenant_id=virtual_tenant_id,
error=str(e)
)
raise
async def _delete_redis_cache(self, virtual_tenant_id: str):
"""Delete all Redis keys for a virtual tenant"""
try:
client = await self.redis.get_client()
pattern = f"*:{virtual_tenant_id}:*"
keys = await client.keys(pattern)
if keys:
await client.delete(*keys)
logger.debug("Deleted Redis cache", tenant_id=virtual_tenant_id, keys_deleted=len(keys))
except Exception as e:
logger.warning("Failed to delete Redis cache", error=str(e), tenant_id=virtual_tenant_id)
async def _delete_from_all_services(self, virtual_tenant_id: str):
"""Delete data from all services for a tenant"""
tasks = [
self._delete_from_service(name, url, virtual_tenant_id)
for name, url in self.services
]
return await asyncio.gather(*tasks, return_exceptions=True)
async def _delete_tenant_data(self, tenant_id: str, session_id: str) -> dict:
"""Delete demo data for a tenant across all services"""
logger.info("Deleting tenant data", tenant_id=tenant_id, session_id=session_id)
results = {}
async def delete_from_service(service_name: str, service_url: str):
try:
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.delete(
f"{service_url}/internal/demo/tenant/{tenant_id}",
headers={"X-Internal-API-Key": self.internal_api_key}
)
if response.status_code == 200:
logger.debug(f"Deleted data from {service_name}", tenant_id=tenant_id)
return {"service": service_name, "status": "deleted"}
else:
logger.warning(
f"Failed to delete from {service_name}",
status_code=response.status_code,
tenant_id=tenant_id
)
return {"service": service_name, "status": "failed", "error": f"HTTP {response.status_code}"}
except Exception as e:
logger.warning(
f"Exception deleting from {service_name}",
error=str(e),
tenant_id=tenant_id
)
return {"service": service_name, "status": "failed", "error": str(e)}
# Delete from all services in parallel
tasks = [delete_from_service(name, url) for name, url in self.services]
service_results = await asyncio.gather(*tasks, return_exceptions=True)
for result in service_results:
if isinstance(result, Exception):
logger.error("Service deletion failed", error=str(result))
elif isinstance(result, dict):
results[result["service"]] = result
return results
async def cleanup_expired_sessions(self) -> dict:
"""
@@ -32,9 +231,9 @@ class DemoCleanupService:
Returns:
Cleanup statistics
"""
from datetime import timedelta
logger.info("Starting demo session cleanup")
start_time = datetime.now(timezone.utc)
now = datetime.now(timezone.utc)
stuck_threshold = now - timedelta(minutes=5) # Sessions pending > 5 min are stuck
@@ -97,10 +296,7 @@ class DemoCleanupService:
)
for child_id in child_tenant_ids:
try:
await self.data_cloner.delete_session_data(
str(child_id),
session.session_id
)
await self._delete_tenant_data(child_id, session.session_id)
except Exception as child_error:
logger.error(
"Failed to delete child tenant",
@@ -109,11 +305,14 @@ class DemoCleanupService:
)
# Delete parent/main session data
await self.data_cloner.delete_session_data(
await self._delete_tenant_data(
str(session.virtual_tenant_id),
session.session_id
)
# Delete Redis data
await self.redis.delete_session_data(session.session_id)
stats["cleaned_up"] += 1
logger.info(
@@ -137,6 +336,19 @@ class DemoCleanupService:
)
logger.info("Demo session cleanup completed", stats=stats)
# Update Prometheus metrics
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
demo_session_cleanup_duration_seconds.labels(tier="all").observe(duration_ms / 1000)
# Update deleted-session metrics per tier (tier is read from each session's demo_account_type)
for session in all_sessions_to_cleanup:
demo_sessions_deleted_total.labels(
tier=session.demo_account_type,
status="success"
).inc()
demo_sessions_active.labels(tier=session.demo_account_type).dec()
return stats
async def cleanup_old_destroyed_sessions(self, days: int = 7) -> int:
@@ -149,8 +361,6 @@ class DemoCleanupService:
Returns:
Number of deleted records
"""
from datetime import timedelta
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days)
result = await self.db.execute(

File diff suppressed because it is too large

View File

@@ -1,604 +0,0 @@
"""
Cloning Strategy Pattern Implementation
Provides explicit, type-safe strategies for different demo account types
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Any, List, Optional
from datetime import datetime, timezone
import structlog
logger = structlog.get_logger()
@dataclass
class CloningContext:
"""
Context object containing all data needed for cloning operations
Immutable to prevent state mutation bugs
"""
base_tenant_id: str
virtual_tenant_id: str
session_id: str
demo_account_type: str
session_metadata: Optional[Dict[str, Any]] = None
services_filter: Optional[List[str]] = None
# Orchestrator dependencies (injected)
orchestrator: Any = None # Will be CloneOrchestrator instance
def __post_init__(self):
"""Validate context after initialization"""
if not self.base_tenant_id:
raise ValueError("base_tenant_id is required")
if not self.virtual_tenant_id:
raise ValueError("virtual_tenant_id is required")
if not self.session_id:
raise ValueError("session_id is required")
class CloningStrategy(ABC):
"""
Abstract base class for cloning strategies
Each strategy is a leaf node - no recursion possible
"""
@abstractmethod
async def clone(self, context: CloningContext) -> Dict[str, Any]:
"""
Execute the cloning strategy
Args:
context: Immutable context with all required data
Returns:
Dictionary with cloning results
"""
pass
@abstractmethod
def get_strategy_name(self) -> str:
"""Return the name of this strategy for logging"""
pass
class ProfessionalCloningStrategy(CloningStrategy):
"""
Strategy for single-tenant professional demos
Clones all services for a single virtual tenant
"""
def get_strategy_name(self) -> str:
return "professional"
async def clone(self, context: CloningContext) -> Dict[str, Any]:
"""
Clone demo data for a professional (single-tenant) account
Process:
1. Validate context
2. Clone all services in parallel
3. Handle failures with partial success support
4. Return aggregated results
"""
logger.info(
"Executing professional cloning strategy",
session_id=context.session_id,
virtual_tenant_id=context.virtual_tenant_id,
base_tenant_id=context.base_tenant_id
)
start_time = datetime.now(timezone.utc)
# Determine which services to clone
services_to_clone = context.orchestrator.services
if context.services_filter:
services_to_clone = [
s for s in context.orchestrator.services
if s.name in context.services_filter
]
logger.info(
"Filtering services",
session_id=context.session_id,
services_filter=context.services_filter,
filtered_count=len(services_to_clone)
)
# Rollback stack for cleanup
rollback_stack = []
try:
# Import asyncio here to avoid circular imports
import asyncio
# Create parallel tasks for all services
tasks = []
service_map = {}
for service_def in services_to_clone:
task = asyncio.create_task(
context.orchestrator._clone_service(
service_def=service_def,
base_tenant_id=context.base_tenant_id,
virtual_tenant_id=context.virtual_tenant_id,
demo_account_type=context.demo_account_type,
session_id=context.session_id,
session_metadata=context.session_metadata
)
)
tasks.append(task)
service_map[task] = service_def.name
# Process tasks as they complete for real-time progress updates
service_results = {}
total_records = 0
failed_services = []
required_service_failed = False
completed_count = 0
total_count = len(tasks)
# Create a mapping from futures to service names to properly identify completed tasks
# We'll use asyncio.wait approach instead of as_completed to access the original tasks
pending = set(tasks)
completed_tasks_info = {task: service_map[task] for task in tasks} # Map tasks to service names
while pending:
# Wait for at least one task to complete
done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
# Process each completed task
for completed_task in done:
try:
# Get the result from the completed task
result = await completed_task
# Get the service name from our mapping
service_name = completed_tasks_info[completed_task]
service_def = next(s for s in services_to_clone if s.name == service_name)
service_results[service_name] = result
completed_count += 1
if result.get("status") == "failed":
failed_services.append(service_name)
if service_def.required:
required_service_failed = True
else:
total_records += result.get("records_cloned", 0)
# Track successful services for rollback
if result.get("status") == "completed":
rollback_stack.append({
"type": "service",
"service_name": service_name,
"tenant_id": context.virtual_tenant_id,
"session_id": context.session_id
})
# Update Redis with granular progress after each service completes
await context.orchestrator._update_progress_in_redis(context.session_id, {
"completed_services": completed_count,
"total_services": total_count,
"progress_percentage": int((completed_count / total_count) * 100),
"services": service_results,
"total_records_cloned": total_records
})
logger.info(
f"Service {service_name} completed ({completed_count}/{total_count})",
session_id=context.session_id,
records_cloned=result.get("records_cloned", 0)
)
except Exception as e:
# Handle exceptions from the task itself
service_name = completed_tasks_info[completed_task]
service_def = next(s for s in services_to_clone if s.name == service_name)
logger.error(
f"Service {service_name} cloning failed with exception",
session_id=context.session_id,
error=str(e)
)
service_results[service_name] = {
"status": "failed",
"error": str(e),
"records_cloned": 0
}
failed_services.append(service_name)
completed_count += 1
if service_def.required:
required_service_failed = True
# Determine overall status
if required_service_failed:
overall_status = "failed"
elif failed_services:
overall_status = "partial"
else:
overall_status = "completed"
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
logger.info(
"Professional cloning strategy completed",
session_id=context.session_id,
overall_status=overall_status,
total_records=total_records,
failed_services=failed_services,
duration_ms=duration_ms
)
return {
"overall_status": overall_status,
"services": service_results,
"total_records": total_records,
"failed_services": failed_services,
"duration_ms": duration_ms,
"rollback_stack": rollback_stack
}
except Exception as e:
logger.error(
"Professional cloning strategy failed",
session_id=context.session_id,
error=str(e),
exc_info=True
)
return {
"overall_status": "failed",
"error": str(e),
"services": {},
"total_records": 0,
"failed_services": [],
"duration_ms": int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000),
"rollback_stack": rollback_stack
}
class EnterpriseCloningStrategy(CloningStrategy):
"""
Strategy for multi-tenant enterprise demos
Clones parent tenant + child tenants + distribution data
"""
def get_strategy_name(self) -> str:
return "enterprise"
async def clone(self, context: CloningContext) -> Dict[str, Any]:
"""
Clone demo data for an enterprise (multi-tenant) account
Process:
1. Validate enterprise metadata
2. Clone parent tenant using ProfessionalCloningStrategy
3. Clone child tenants in parallel
4. Update distribution data with child mappings
5. Return aggregated results
NOTE: No recursion - uses ProfessionalCloningStrategy as a helper
"""
logger.info(
"Executing enterprise cloning strategy",
session_id=context.session_id,
parent_tenant_id=context.virtual_tenant_id,
base_tenant_id=context.base_tenant_id
)
start_time = datetime.now(timezone.utc)
results = {
"parent": {},
"children": [],
"distribution": {},
"overall_status": "pending"
}
rollback_stack = []
try:
# Validate enterprise metadata
if not context.session_metadata:
raise ValueError("Enterprise cloning requires session_metadata")
is_enterprise = context.session_metadata.get("is_enterprise", False)
child_configs = context.session_metadata.get("child_configs", [])
child_tenant_ids = context.session_metadata.get("child_tenant_ids", [])
if not is_enterprise:
raise ValueError("session_metadata.is_enterprise must be True")
if not child_configs or not child_tenant_ids:
raise ValueError("Enterprise metadata missing child_configs or child_tenant_ids")
logger.info(
"Enterprise metadata validated",
session_id=context.session_id,
child_count=len(child_configs)
)
# Phase 1: Clone parent tenant
logger.info("Phase 1: Cloning parent tenant", session_id=context.session_id)
# Update progress
await context.orchestrator._update_progress_in_redis(context.session_id, {
"parent": {"overall_status": "pending"},
"children": [],
"distribution": {}
})
# Use ProfessionalCloningStrategy to clone parent
# This is composition, not recursion - explicit strategy usage
professional_strategy = ProfessionalCloningStrategy()
parent_context = CloningContext(
base_tenant_id=context.base_tenant_id,
virtual_tenant_id=context.virtual_tenant_id,
session_id=context.session_id,
demo_account_type="enterprise", # Explicit type for parent tenant
session_metadata=context.session_metadata,
orchestrator=context.orchestrator
)
parent_result = await professional_strategy.clone(parent_context)
results["parent"] = parent_result
# Update progress
await context.orchestrator._update_progress_in_redis(context.session_id, {
"parent": parent_result,
"children": [],
"distribution": {}
})
# Track parent for rollback
if parent_result.get("overall_status") not in ["failed"]:
rollback_stack.append({
"type": "tenant",
"tenant_id": context.virtual_tenant_id,
"session_id": context.session_id
})
# Validate parent success
parent_status = parent_result.get("overall_status")
if parent_status == "failed":
logger.error(
"Parent cloning failed, aborting enterprise demo",
session_id=context.session_id,
failed_services=parent_result.get("failed_services", [])
)
results["overall_status"] = "failed"
results["error"] = "Parent tenant cloning failed"
results["duration_ms"] = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
return results
if parent_status == "partial":
# Check if tenant service succeeded (critical)
parent_services = parent_result.get("services", {})
if parent_services.get("tenant", {}).get("status") != "completed":
logger.error(
"Tenant service failed in parent, cannot create children",
session_id=context.session_id
)
results["overall_status"] = "failed"
results["error"] = "Parent tenant creation failed - cannot create child tenants"
results["duration_ms"] = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
return results
logger.info(
"Parent cloning succeeded, proceeding with children",
session_id=context.session_id,
parent_status=parent_status
)
# Phase 2: Clone child tenants in parallel
logger.info(
"Phase 2: Cloning child outlets",
session_id=context.session_id,
child_count=len(child_configs)
)
# Update progress
await context.orchestrator._update_progress_in_redis(context.session_id, {
"parent": parent_result,
"children": [{"status": "pending"} for _ in child_configs],
"distribution": {}
})
# Import asyncio for parallel execution
import asyncio
child_tasks = []
for idx, (child_config, child_id) in enumerate(zip(child_configs, child_tenant_ids)):
task = context.orchestrator._clone_child_outlet(
base_tenant_id=child_config.get("base_tenant_id"),
virtual_child_id=child_id,
parent_tenant_id=context.virtual_tenant_id,
child_name=child_config.get("name"),
location=child_config.get("location"),
session_id=context.session_id
)
child_tasks.append(task)
child_results = await asyncio.gather(*child_tasks, return_exceptions=True)
# Process child results
children_data = []
failed_children = 0
for idx, result in enumerate(child_results):
if isinstance(result, Exception):
logger.error(
f"Child {idx} cloning failed",
session_id=context.session_id,
error=str(result)
)
children_data.append({
"status": "failed",
"error": str(result),
"child_id": child_tenant_ids[idx] if idx < len(child_tenant_ids) else None
})
failed_children += 1
else:
children_data.append(result)
if result.get("overall_status") == "failed":
failed_children += 1
else:
# Track for rollback
rollback_stack.append({
"type": "tenant",
"tenant_id": result.get("child_id"),
"session_id": context.session_id
})
results["children"] = children_data
# Update progress
await context.orchestrator._update_progress_in_redis(context.session_id, {
"parent": parent_result,
"children": children_data,
"distribution": {}
})
logger.info(
"Child cloning completed",
session_id=context.session_id,
total_children=len(child_configs),
failed_children=failed_children
)
# Phase 3: Clone distribution data
logger.info("Phase 3: Cloning distribution data", session_id=context.session_id)
# Find distribution service definition
dist_service_def = next(
(s for s in context.orchestrator.services if s.name == "distribution"),
None
)
if dist_service_def:
dist_result = await context.orchestrator._clone_service(
service_def=dist_service_def,
base_tenant_id=context.base_tenant_id,
virtual_tenant_id=context.virtual_tenant_id,
demo_account_type="enterprise",
session_id=context.session_id,
session_metadata=context.session_metadata
)
results["distribution"] = dist_result
# Update progress
await context.orchestrator._update_progress_in_redis(context.session_id, {
"parent": parent_result,
"children": children_data,
"distribution": dist_result
})
# Track for rollback
if dist_result.get("status") == "completed":
rollback_stack.append({
"type": "service",
"service_name": "distribution",
"tenant_id": context.virtual_tenant_id,
"session_id": context.session_id
})
total_records_cloned = parent_result.get("total_records", 0)
total_records_cloned += dist_result.get("records_cloned", 0)
else:
logger.warning("Distribution service not found in orchestrator", session_id=context.session_id)
# Determine overall status
if failed_children == len(child_configs):
overall_status = "failed"
elif failed_children > 0:
overall_status = "partial"
else:
overall_status = "completed" # Changed from "ready" to match professional strategy
# Calculate total records cloned (parent + all children)
total_records_cloned = parent_result.get("total_records", 0)
for child in children_data:
if isinstance(child, dict):
total_records_cloned += child.get("total_records", child.get("records_cloned", 0))
results["overall_status"] = overall_status
results["total_records_cloned"] = total_records_cloned # Add for session manager
results["duration_ms"] = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
results["rollback_stack"] = rollback_stack
# Include services from parent for session manager compatibility
results["services"] = parent_result.get("services", {})
logger.info(
"Enterprise cloning strategy completed",
session_id=context.session_id,
overall_status=overall_status,
parent_status=parent_status,
children_status=f"{len(child_configs) - failed_children}/{len(child_configs)} succeeded",
total_records_cloned=total_records_cloned,
duration_ms=results["duration_ms"]
)
return results
except Exception as e:
logger.error(
"Enterprise cloning strategy failed",
session_id=context.session_id,
error=str(e),
exc_info=True
)
return {
"overall_status": "failed",
"error": str(e),
"parent": {},
"children": [],
"distribution": {},
"duration_ms": int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000),
"rollback_stack": rollback_stack
}
class CloningStrategyFactory:
"""
Factory for creating cloning strategies
Provides type-safe strategy selection
"""
_strategies: Dict[str, CloningStrategy] = {
"professional": ProfessionalCloningStrategy(),
"enterprise": EnterpriseCloningStrategy(),
"enterprise_child": ProfessionalCloningStrategy() # Alias: children use professional strategy
}
@classmethod
def get_strategy(cls, demo_account_type: str) -> CloningStrategy:
"""
Get the appropriate cloning strategy for the demo account type
Args:
demo_account_type: Type of demo account ("professional" or "enterprise")
Returns:
CloningStrategy instance
Raises:
ValueError: If demo_account_type is not supported
"""
strategy = cls._strategies.get(demo_account_type)
if not strategy:
raise ValueError(
f"Unknown demo_account_type: {demo_account_type}. "
f"Supported types: {list(cls._strategies.keys())}"
)
return strategy
@classmethod
def register_strategy(cls, name: str, strategy: CloningStrategy):
"""
Register a custom cloning strategy
Args:
name: Strategy name
strategy: Strategy instance
"""
cls._strategies[name] = strategy
logger.info(f"Registered custom cloning strategy: {name}")

View File

@@ -1,356 +0,0 @@
"""
Demo Data Cloner
Clones base demo data to session-specific virtual tenants
"""
from sqlalchemy.ext.asyncio import AsyncSession
from typing import Dict, Any, List, Optional
import httpx
import structlog
import uuid
import os
import asyncio
from app.core.redis_wrapper import DemoRedisWrapper
from app.core import settings
logger = structlog.get_logger()
class DemoDataCloner:
"""Clones demo data for isolated sessions"""
def __init__(self, db: AsyncSession, redis: DemoRedisWrapper):
self.db = db
self.redis = redis
self._http_client: Optional[httpx.AsyncClient] = None
async def get_http_client(self) -> httpx.AsyncClient:
"""Get or create shared HTTP client with connection pooling"""
if self._http_client is None:
self._http_client = httpx.AsyncClient(
timeout=httpx.Timeout(30.0, connect=10.0),  # httpx's keyword is 'connect', not 'connect_timeout'
limits=httpx.Limits(
max_connections=20,
max_keepalive_connections=10,
keepalive_expiry=30.0
)
)
return self._http_client
async def close(self):
"""Close HTTP client on cleanup"""
if self._http_client:
await self._http_client.aclose()
self._http_client = None
async def clone_tenant_data(
self,
session_id: str,
base_demo_tenant_id: str,
virtual_tenant_id: str,
demo_account_type: str
) -> Dict[str, Any]:
"""
Clone all demo data from base tenant to virtual tenant
Args:
session_id: Session ID
base_demo_tenant_id: Base demo tenant UUID
virtual_tenant_id: Virtual tenant UUID for this session
demo_account_type: Type of demo account
Returns:
Cloning statistics
"""
logger.info(
"Starting data cloning",
session_id=session_id,
base_demo_tenant_id=base_demo_tenant_id,
virtual_tenant_id=virtual_tenant_id
)
stats = {
"session_id": session_id,
"services_cloned": [],
"total_records": 0,
"redis_keys": 0
}
# Clone data from each service based on demo account type
services_to_clone = self._get_services_for_demo_type(demo_account_type)
for service_name in services_to_clone:
try:
service_stats = await self._clone_service_data(
service_name,
base_demo_tenant_id,
virtual_tenant_id,
session_id,
demo_account_type
)
stats["services_cloned"].append(service_name)
stats["total_records"] += service_stats.get("records_cloned", 0)
except Exception as e:
logger.error(
"Failed to clone service data",
service=service_name,
error=str(e)
)
# Populate Redis cache with hot data
redis_stats = await self._populate_redis_cache(
session_id,
virtual_tenant_id,
demo_account_type
)
stats["redis_keys"] = redis_stats.get("keys_created", 0)
logger.info(
"Data cloning completed",
session_id=session_id,
stats=stats
)
return stats
def _get_services_for_demo_type(self, demo_account_type: str) -> List[str]:
"""Get list of services to clone based on demo type"""
base_services = ["inventory", "sales", "orders", "pos"]
if demo_account_type == "professional":
# Professional has production, recipes, suppliers, and procurement
return base_services + ["recipes", "production", "suppliers", "procurement", "alert_processor"]
elif demo_account_type == "enterprise":
# Enterprise has suppliers, procurement, and distribution (for parent-child network)
return base_services + ["suppliers", "procurement", "distribution", "alert_processor"]
else:
# Default: suppliers, procurement, distribution, and the alert processor
return base_services + ["suppliers", "procurement", "distribution", "alert_processor"]
async def _clone_service_data(
self,
service_name: str,
base_tenant_id: str,
virtual_tenant_id: str,
session_id: str,
demo_account_type: str
) -> Dict[str, Any]:
"""
Clone data for a specific service
Args:
service_name: Name of the service
base_tenant_id: Source tenant ID
virtual_tenant_id: Target tenant ID
session_id: Session ID
demo_account_type: Type of demo account
Returns:
Cloning statistics
"""
service_url = self._get_service_url(service_name)
# Get internal API key from settings
from app.core.config import settings
internal_api_key = settings.INTERNAL_API_KEY
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
f"{service_url}/internal/demo/clone",
json={
"base_tenant_id": base_tenant_id,
"virtual_tenant_id": virtual_tenant_id,
"session_id": session_id,
"demo_account_type": demo_account_type
},
headers={"X-Internal-API-Key": internal_api_key}
)
response.raise_for_status()
return response.json()
async def _populate_redis_cache(
self,
session_id: str,
virtual_tenant_id: str,
demo_account_type: str
) -> Dict[str, Any]:
"""
Populate Redis with frequently accessed data
Args:
session_id: Session ID
virtual_tenant_id: Virtual tenant ID
demo_account_type: Demo account type
Returns:
Statistics about cached data
"""
logger.info("Populating Redis cache", session_id=session_id)
keys_created = 0
# Cache inventory data (hot data)
try:
inventory_data = await self._fetch_inventory_data(virtual_tenant_id)
await self.redis.set_session_data(
session_id,
"inventory",
inventory_data,
ttl=settings.REDIS_SESSION_TTL
)
keys_created += 1
except Exception as e:
logger.error("Failed to cache inventory", error=str(e))
# Cache POS data
try:
pos_data = await self._fetch_pos_data(virtual_tenant_id)
await self.redis.set_session_data(
session_id,
"pos",
pos_data,
ttl=settings.REDIS_SESSION_TTL
)
keys_created += 1
except Exception as e:
logger.error("Failed to cache POS data", error=str(e))
# Cache recent sales
try:
sales_data = await self._fetch_recent_sales(virtual_tenant_id)
await self.redis.set_session_data(
session_id,
"recent_sales",
sales_data,
ttl=settings.REDIS_SESSION_TTL
)
keys_created += 1
except Exception as e:
logger.error("Failed to cache sales", error=str(e))
return {"keys_created": keys_created}
async def _fetch_inventory_data(self, tenant_id: str) -> Dict[str, Any]:
"""Fetch inventory data for caching"""
async with httpx.AsyncClient(timeout=httpx.Timeout(15.0, connect=5.0)) as client:
response = await client.get(
f"{settings.INVENTORY_SERVICE_URL}/api/inventory/summary",
headers={"X-Tenant-Id": tenant_id}
)
return response.json()
async def _fetch_pos_data(self, tenant_id: str) -> Dict[str, Any]:
"""Fetch POS data for caching"""
async with httpx.AsyncClient(timeout=httpx.Timeout(15.0, connect=5.0)) as client:
response = await client.get(
f"{settings.POS_SERVICE_URL}/api/pos/current-session",
headers={"X-Tenant-Id": tenant_id}
)
return response.json()
async def _fetch_recent_sales(self, tenant_id: str) -> Dict[str, Any]:
"""Fetch recent sales for caching"""
async with httpx.AsyncClient() as client:
response = await client.get(
f"{settings.SALES_SERVICE_URL}/api/sales/recent?limit=50",
headers={"X-Tenant-Id": tenant_id}
)
return response.json()
def _get_service_url(self, service_name: str) -> str:
"""Get service URL from settings"""
url_map = {
"inventory": settings.INVENTORY_SERVICE_URL,
"recipes": settings.RECIPES_SERVICE_URL,
"sales": settings.SALES_SERVICE_URL,
"orders": settings.ORDERS_SERVICE_URL,
"production": settings.PRODUCTION_SERVICE_URL,
"suppliers": settings.SUPPLIERS_SERVICE_URL,
"pos": settings.POS_SERVICE_URL,
"procurement": settings.PROCUREMENT_SERVICE_URL,
"distribution": settings.DISTRIBUTION_SERVICE_URL,
"forecasting": settings.FORECASTING_SERVICE_URL,
"alert_processor": settings.ALERT_PROCESSOR_SERVICE_URL,
}
return url_map.get(service_name, "")
async def delete_session_data(
self,
virtual_tenant_id: str,
session_id: str
):
"""
Delete all data for a session using parallel deletion for performance
Args:
virtual_tenant_id: Virtual tenant ID to delete
session_id: Session ID
"""
logger.info(
"Deleting session data",
virtual_tenant_id=virtual_tenant_id,
session_id=session_id
)
# Get shared HTTP client for all deletions
client = await self.get_http_client()
# Services list - all can be deleted in parallel as deletion endpoints
# handle their own internal ordering if needed
services = [
"forecasting",
"sales",
"orders",
"production",
"inventory",
"recipes",
"suppliers",
"pos",
"distribution",
"procurement",
"alert_processor"
]
# Create deletion tasks for all services
deletion_tasks = [
self._delete_service_data(service_name, virtual_tenant_id, client)
for service_name in services
]
# Execute all deletions in parallel with exception handling
results = await asyncio.gather(*deletion_tasks, return_exceptions=True)
# Log any failures
for service_name, result in zip(services, results):
if isinstance(result, Exception):
logger.error(
"Failed to delete service data",
service=service_name,
error=str(result)
)
# Delete from Redis
await self.redis.delete_session_data(session_id)
logger.info("Session data deleted", virtual_tenant_id=virtual_tenant_id)
async def _delete_service_data(
self,
service_name: str,
virtual_tenant_id: str,
client: httpx.AsyncClient
):
"""Delete data from a specific service using provided HTTP client"""
service_url = self._get_service_url(service_name)
# Get internal API key from settings
from app.core.config import settings
internal_api_key = settings.INTERNAL_API_KEY
await client.delete(
f"{service_url}/internal/demo/tenant/{virtual_tenant_id}",
headers={"X-Internal-API-Key": internal_api_key}
)

View File

@@ -75,18 +75,11 @@ class DemoSessionManager:
base_tenant_id = uuid.UUID(base_tenant_id_str)
# Validate that the base tenant ID exists in the tenant service
# This is important to prevent cloning from non-existent base tenants
await self._validate_base_tenant_exists(base_tenant_id, demo_account_type)
# Handle enterprise chain setup
child_tenant_ids = []
if demo_account_type == 'enterprise':
# Validate child template tenants exist before proceeding
child_configs = demo_config.get('children', [])
await self._validate_child_template_tenants(child_configs)
# Generate child tenant IDs for enterprise demos
child_configs = demo_config.get('children', [])
child_tenant_ids = [uuid.uuid4() for _ in child_configs]
# Create session record using repository
@@ -208,9 +201,7 @@ class DemoSessionManager:
async def destroy_session(self, session_id: str):
"""
Destroy a demo session and cleanup resources
Args:
session_id: Session ID to destroy
This triggers parallel deletion across all services.
"""
session = await self.get_session(session_id)
@@ -218,8 +209,30 @@ class DemoSessionManager:
logger.warning("Session not found for destruction", session_id=session_id)
return
# Update session status via repository
await self.repository.destroy(session_id)
# Update status to DESTROYING
await self.repository.update_fields(
session_id,
status=DemoSessionStatus.DESTROYING
)
# Trigger cleanup across all services
cleanup_service = DemoCleanupService(self.db, self.redis)
result = await cleanup_service.cleanup_session(session)
if result["success"]:
# Update status to DESTROYED
await self.repository.update_fields(
session_id,
status=DemoSessionStatus.DESTROYED,
destroyed_at=datetime.now(timezone.utc)
)
else:
# Update status to FAILED with error details
await self.repository.update_fields(
session_id,
status=DemoSessionStatus.FAILED,
error_details=result["errors"]
)
# Delete Redis data
await self.redis.delete_session_data(session_id)
@@ -227,9 +240,34 @@ class DemoSessionManager:
logger.info(
"Session destroyed",
session_id=session_id,
virtual_tenant_id=str(session.virtual_tenant_id),
total_records_deleted=result.get("total_deleted", 0),
duration_ms=result.get("duration_ms", 0)
)
async def _check_database_disk_space(self):
"""Check if database has sufficient disk space for demo operations"""
try:
# Execute a simple query to check database health and disk space
# This is a basic check - in production you might want more comprehensive monitoring
from sqlalchemy import text
# Check if we can execute a simple query (indicates basic database health)
result = await self.db.execute(text("SELECT 1"))
# Get the scalar result properly
scalar_result = result.scalar_one_or_none()
# For more comprehensive checking, you could add:
# 1. Check table sizes
# 2. Check available disk space via system queries (if permissions allow)
# 3. Check for long-running transactions that might block operations
logger.debug("Database health check passed", result=scalar_result)
except Exception as e:
logger.error("Database health check failed", error=str(e), exc_info=True)
raise RuntimeError(f"Database health check failed: {str(e)}")
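One of the more comprehensive checks mentioned above could measure actual database size; a Postgres-specific sketch (method name and threshold are illustrative):

from sqlalchemy import text

async def _check_database_size(self, max_bytes: int = 50 * 1024**3) -> None:
    # pg_database_size returns the on-disk bytes of the current database
    result = await self.db.execute(
        text("SELECT pg_database_size(current_database())")
    )
    size = result.scalar_one()
    if size > max_bytes:
        raise RuntimeError(f"Database size {size} exceeds limit {max_bytes}")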
async def _store_session_metadata(self, session: DemoSession):
"""Store session metadata in Redis"""
await self.redis.set_session_data(
@@ -274,6 +312,33 @@ class DemoSessionManager:
virtual_tenant_id=str(session.virtual_tenant_id)
)
# Check database disk space before starting cloning
try:
await self._check_database_disk_space()
except Exception as e:
logger.error(
"Database disk space check failed",
session_id=session.session_id,
error=str(e)
)
# Mark session as failed due to infrastructure issue
session.status = DemoSessionStatus.FAILED
session.cloning_completed_at = datetime.now(timezone.utc)
session.total_records_cloned = 0
session.cloning_progress = {
"error": "Database disk space issue detected",
"details": str(e)
}
await self.repository.update(session)
await self._cache_session_status(session)
return {
"overall_status": "failed",
"services": {},
"total_records": 0,
"failed_services": ["database"],
"error": "Database disk space issue"
}
# Mark cloning as started and update both database and Redis cache
session.cloning_started_at = datetime.now(timezone.utc)
await self.repository.update(session)
@@ -295,130 +360,7 @@ class DemoSessionManager:
return result
async def _validate_base_tenant_exists(self, base_tenant_id: uuid.UUID, demo_account_type: str) -> bool:
"""
Validate that the base tenant exists in the tenant service before starting cloning.
This prevents cloning from non-existent base tenants.
Args:
base_tenant_id: The UUID of the base tenant to validate
demo_account_type: The demo account type for logging
Returns:
True if tenant exists, raises exception otherwise
"""
logger.info(
"Validating base tenant exists before cloning",
base_tenant_id=str(base_tenant_id),
demo_account_type=demo_account_type
)
# Basic validation: check if UUID is valid (not empty/nil)
if str(base_tenant_id) == "00000000-0000-0000-0000-000000000000":
raise ValueError(f"Invalid base tenant ID: {base_tenant_id} for demo type: {demo_account_type}")
# BUG-008 FIX: Actually validate with tenant service
try:
from shared.clients.tenant_client import TenantServiceClient
tenant_client = TenantServiceClient(settings)
tenant = await tenant_client.get_tenant(str(base_tenant_id))
if not tenant:
error_msg = (
f"Base tenant {base_tenant_id} does not exist for demo type {demo_account_type}. "
f"Please verify the base_tenant_id in demo configuration."
)
logger.error(
"Base tenant validation failed",
base_tenant_id=str(base_tenant_id),
demo_account_type=demo_account_type
)
raise ValueError(error_msg)
logger.info(
"Base tenant validation passed",
base_tenant_id=str(base_tenant_id),
tenant_name=tenant.get("name", "unknown"),
demo_account_type=demo_account_type
)
return True
except ValueError:
# Re-raise ValueError from validation failure
raise
except Exception as e:
logger.error(
f"Error validating base tenant: {e}",
base_tenant_id=str(base_tenant_id),
demo_account_type=demo_account_type,
exc_info=True
)
raise ValueError(f"Cannot validate base tenant {base_tenant_id}: {str(e)}")
async def _validate_child_template_tenants(self, child_configs: list) -> bool:
"""
Validate that all child template tenants exist before cloning.
This prevents silent failures when child base tenants are missing.
Args:
child_configs: List of child configurations with base_tenant_id
Returns:
True if all child templates exist, raises exception otherwise
"""
if not child_configs:
logger.warning("No child configurations provided for validation")
return True
logger.info("Validating child template tenants", child_count=len(child_configs))
try:
from shared.clients.tenant_client import TenantServiceClient
tenant_client = TenantServiceClient(settings)
for child_config in child_configs:
child_base_id = child_config.get("base_tenant_id")
child_name = child_config.get("name", "unknown")
if not child_base_id:
raise ValueError(f"Child config missing base_tenant_id: {child_name}")
# Validate child template exists
child_tenant = await tenant_client.get_tenant(child_base_id)
if not child_tenant:
error_msg = (
f"Child template tenant {child_base_id} ('{child_name}') does not exist. "
f"Please verify the base_tenant_id in demo configuration."
)
logger.error(
"Child template validation failed",
base_tenant_id=child_base_id,
child_name=child_name
)
raise ValueError(error_msg)
logger.info(
"Child template validation passed",
base_tenant_id=child_base_id,
child_name=child_name,
tenant_name=child_tenant.get("name", "unknown")
)
logger.info("All child template tenants validated successfully")
return True
except ValueError:
# Re-raise ValueError from validation failure
raise
except Exception as e:
logger.error(
f"Error validating child template tenants: {e}",
exc_info=True
)
raise ValueError(f"Cannot validate child template tenants: {str(e)}")
async def _update_session_from_clone_result(
self,
@@ -573,4 +515,4 @@ class DemoSessionManager:
# Trigger new cloning attempt
result = await self.trigger_orchestrated_cloning(session, base_tenant_id)
return result