# ================================================================
# services/training/tests/conftest.py
# ================================================================
"""
Test configuration and fixtures for Training Service
Provides shared fixtures, mock data, and test utilities
"""
import pytest
import pytest_asyncio
import asyncio
import pandas as pd
import numpy as np
import tempfile
import os
import json
from datetime import datetime, timedelta
from unittest.mock import Mock, AsyncMock, patch
from typing import Dict, List, Any, Generator
from pathlib import Path
import logging
from app.models.training import ModelTrainingLog, TrainedModel
# Configure pytest-asyncio. Note: pytestmark in conftest.py only applies to
# this module; async tests in other files need their own marker or
# asyncio_mode = "auto" in the pytest configuration.
pytestmark = pytest.mark.asyncio
# Suppress Prophet logging during tests
logging.getLogger('prophet').setLevel(logging.WARNING)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)
# ================================================================
# INTEGRATION TEST FIXTURES
# ================================================================
@pytest.fixture
def integration_test_setup(
mock_external_services,
sample_bakery_sales_data,
temp_model_storage
):
"""Complete setup for integration tests"""
# Patch model storage path
with patch('app.core.config.settings.MODEL_STORAGE_PATH', str(temp_model_storage)):
# Patch data fetching to use sample data
with patch('app.services.training_service.TrainingService._fetch_sales_data') as mock_fetch:
mock_fetch.return_value = sample_bakery_sales_data
yield {
'external_services': mock_external_services,
'sales_data': sample_bakery_sales_data,
'model_storage': temp_model_storage,
'mock_fetch': mock_fetch
}
@pytest.fixture
def mock_messaging():
"""Mock messaging system for testing"""
with patch('app.services.messaging.publish_job_started') as mock_started, \
patch('app.services.messaging.publish_job_completed') as mock_completed, \
patch('app.services.messaging.publish_job_failed') as mock_failed, \
patch('app.services.messaging.publish_model_trained') as mock_model:
yield {
'publish_job_started': mock_started,
'publish_job_completed': mock_completed,
'publish_job_failed': mock_failed,
'publish_model_trained': mock_model
}
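# Illustrative usage of mock_messaging (a hypothetical test; TrainingService
# and its method name are assumptions about the app code, not part of this
# conftest):
#
#   async def test_job_publishes_started_event(mock_messaging, integration_test_setup):
#       service = TrainingService()
#       await service.start_training_job(tenant_id="test_tenant_123")
#       mock_messaging['publish_job_started'].assert_called_once()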
# ================================================================
# API TEST FIXTURES
# ================================================================
@pytest.fixture
def test_app():
    """Test FastAPI application instance"""
    from app.main import app
    return app
@pytest.fixture
def test_client(test_app):
    """Create a synchronous test client for API testing"""
    # httpx.Client(app=...) does not support ASGI apps; use FastAPI's TestClient
    from fastapi.testclient import TestClient
    with TestClient(test_app, base_url="http://test") as client:
        yield client
@pytest.fixture
def auth_headers():
"""Mock authentication headers"""
return {
"Authorization": "Bearer test_token_123",
"X-Tenant-ID": "test_tenant_123"
}
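# Illustrative usage combining test_client and auth_headers (the /health
# route is an assumption about the service API):
#
#   def test_health_endpoint(test_client, auth_headers):
#       response = test_client.get("/health", headers=auth_headers)
#       assert response.status_code == 200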
# ================================================================
# ERROR SIMULATION FIXTURES
# ================================================================
@pytest.fixture
def failing_external_services():
"""Mock external services that fail for error testing"""
with patch('app.external.aemet.AEMETClient') as mock_aemet, \
patch('app.external.madrid_opendata.MadridOpenDataClient') as mock_madrid:
# Configure to raise exceptions
mock_aemet_instance = AsyncMock()
mock_aemet.return_value = mock_aemet_instance
mock_aemet_instance.get_historical_weather.side_effect = Exception("AEMET API Error")
mock_madrid_instance = AsyncMock()
mock_madrid.return_value = mock_madrid_instance
mock_madrid_instance.get_historical_traffic.side_effect = Exception("Madrid API Error")
yield {
'aemet': mock_aemet_instance,
'madrid': mock_madrid_instance
}
@pytest.fixture
def corrupted_sales_data(sample_bakery_sales_data):
"""Sales data with various quality issues for testing"""
corrupted_data = sample_bakery_sales_data.copy()
# Introduce missing values (20% of quantity data)
missing_mask = np.random.random(len(corrupted_data)) < 0.2
corrupted_data.loc[missing_mask, 'quantity'] = np.nan
# Introduce extreme outliers (1% of data)
outlier_mask = np.random.random(len(corrupted_data)) < 0.01
corrupted_data.loc[outlier_mask, 'quantity'] *= 100
# Introduce inconsistent dates (0.5% of data)
future_mask = np.random.random(len(corrupted_data)) < 0.005
corrupted_data.loc[future_mask, 'date'] = "2025-12-31"
# Introduce negative values (0.2% of data)
negative_mask = np.random.random(len(corrupted_data)) < 0.002
corrupted_data.loc[negative_mask, 'quantity'] = -10
return corrupted_data
# ================================================================
# VALIDATION TEST FIXTURES
# ================================================================
@pytest.fixture
def insufficient_sales_data():
"""Sales data with insufficient volume for training"""
# Only 10 days of data
start_date = datetime(2023, 1, 1)
dates = [start_date + timedelta(days=i) for i in range(10)]
data = []
for date in dates:
data.append({
"date": date.strftime("%Y-%m-%d"),
"product": "Pan Integral",
"quantity": np.random.randint(10, 50),
"revenue": round(np.random.uniform(20, 100), 2),
"temperature": round(np.random.uniform(10, 25), 1),
"precipitation": 0.0,
"is_weekend": date.weekday() >= 5,
"is_holiday": False
})
return pd.DataFrame(data)
@pytest.fixture
def seasonal_product_data():
"""Data for seasonal product (Roscon Reyes) testing"""
start_date = datetime(2023, 1, 1)
dates = [start_date + timedelta(days=i) for i in range(365)]
data = []
for date in dates:
# Roscon Reyes has strong seasonal pattern (Christmas specialty)
base_qty = 5 # Very low base
if date.month == 12: # December - high sales
base_qty = 20 + (date.day - 1) * 2 # Increasing through December
elif date.month == 1 and date.day <= 6: # Until Epiphany
base_qty = 50
# Add some noise
quantity = max(1, int(base_qty + np.random.normal(0, base_qty * 0.2)))
data.append({
"date": date.strftime("%Y-%m-%d"),
"product": "Roscon Reyes",
"quantity": quantity,
"revenue": round(quantity * 25.0, 2), # Expensive specialty item
"temperature": round(15 + 12 * np.sin((date.timetuple().tm_yday / 365) * 2 * np.pi), 1),
"precipitation": max(0, np.random.exponential(0.5)),
"is_weekend": date.weekday() >= 5,
"is_holiday": _is_spanish_holiday(date)
})
return pd.DataFrame(data)
# ================================================================
# CLEANUP FIXTURES
# ================================================================
@pytest.fixture(autouse=True)
def cleanup_after_test():
"""Automatic cleanup after each test"""
yield
    # Clear any temporary model directories left behind
    import shutil
    temp_root = tempfile.gettempdir()
    for temp_dir in os.listdir(temp_root):
        if temp_dir.startswith('test_models_'):
            try:
                shutil.rmtree(os.path.join(temp_root, temp_dir))
            except OSError:
                pass
# ================================================================
# TEST DATA VALIDATION UTILITIES
# ================================================================
class TestDataValidator:
    """Utility class for validating test data quality"""
    __test__ = False  # prevent pytest from collecting this helper as a test class
@staticmethod
def validate_sales_data(df: pd.DataFrame) -> Dict[str, Any]:
"""Validate sales data structure and quality"""
required_columns = ['date', 'product', 'quantity', 'revenue']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
return {'valid': False, 'error': f'Missing columns: {missing_columns}'}
# Check data types
try:
pd.to_datetime(df['date'])
        except (ValueError, TypeError):
return {'valid': False, 'error': 'Invalid date format'}
if not pd.api.types.is_numeric_dtype(df['quantity']):
return {'valid': False, 'error': 'Quantity must be numeric'}
if not pd.api.types.is_numeric_dtype(df['revenue']):
return {'valid': False, 'error': 'Revenue must be numeric'}
# Check for negative values
if (df['quantity'] < 0).any():
return {'valid': False, 'error': 'Negative quantities found'}
if (df['revenue'] < 0).any():
return {'valid': False, 'error': 'Negative revenue found'}
return {'valid': True, 'rows': len(df), 'products': df['product'].nunique()}
@pytest.fixture
def test_data_validator():
"""Test data validator utility"""
return TestDataValidator()
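# Illustrative usage (hypothetical test):
#
#   def test_sample_data_is_valid(test_data_validator, sample_bakery_sales_data):
#       result = test_data_validator.validate_sales_data(sample_bakery_sales_data)
#       assert result['valid'] is True
#       assert result['products'] == 12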
# ================================================================
# LOGGING CONFIGURATION FOR TESTS
# ================================================================
@pytest.fixture(autouse=True)
def configure_test_logging():
"""Configure logging for tests"""
import logging
# Reduce log level for external libraries during tests
logging.getLogger('prophet').setLevel(logging.WARNING)
logging.getLogger('cmdstanpy').setLevel(logging.ERROR)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
# Configure our app logging for tests
logger = logging.getLogger('app')
logger.setLevel(logging.INFO)
yield
# Reset logging after tests
logging.getLogger().handlers.clear()
# ================================================================
# ENVIRONMENT SETUP
# ================================================================
@pytest.fixture(scope="session", autouse=True)
def setup_test_environment():
"""Setup test environment variables"""
os.environ.update({
'ENVIRONMENT': 'test',
'LOG_LEVEL': 'INFO',
'MODEL_STORAGE_PATH': '/tmp/test_models',
'MAX_TRAINING_TIME_MINUTES': '5',
'MIN_TRAINING_DATA_DAYS': '7',
'PROPHET_SEASONALITY_MODE': 'additive',
'ENABLE_SYNTHETIC_DATA': 'true',
'SKIP_EXTERNAL_API_CALLS': 'true'
})
yield
    # Remove test environment variables on teardown
    test_vars = [
        'ENVIRONMENT', 'LOG_LEVEL', 'MODEL_STORAGE_PATH',
        'MAX_TRAINING_TIME_MINUTES', 'MIN_TRAINING_DATA_DAYS',
        'PROPHET_SEASONALITY_MODE', 'ENABLE_SYNTHETIC_DATA',
        'SKIP_EXTERNAL_API_CALLS'
    ]
    for var in test_vars:
        os.environ.pop(var, None)
# ================================================================
# PYTEST CONFIGURATION
# ================================================================
@pytest.fixture(scope="session")
def event_loop():
"""Create an instance of the default event loop for the test session."""
loop = asyncio.new_event_loop()
yield loop
loop.close()
def pytest_configure(config):
"""Configure pytest with custom markers"""
config.addinivalue_line(
"markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')"
)
config.addinivalue_line(
"markers", "integration: marks tests as integration tests"
)
config.addinivalue_line(
"markers", "unit: marks tests as unit tests"
)
config.addinivalue_line(
"markers", "performance: marks tests as performance tests"
)
config.addinivalue_line(
"markers", "external: marks tests that require external services"
)
def pytest_collection_modifyitems(config, items):
"""Modify test collection to add markers automatically"""
for item in items:
# Mark performance tests
if "performance" in item.nodeid:
item.add_marker(pytest.mark.performance)
item.add_marker(pytest.mark.slow)
# Mark integration tests
if "integration" in item.nodeid:
item.add_marker(pytest.mark.integration)
# Mark end-to-end tests
if "end_to_end" in item.nodeid:
item.add_marker(pytest.mark.integration)
item.add_marker(pytest.mark.external)
# Mark unit tests (default for others)
if not any(marker.name in ["integration", "performance"] for marker in item.iter_markers()):
item.add_marker(pytest.mark.unit)
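# With the markers registered above, test subsets can be selected from the
# command line, for example:
#
#   pytest -m unit                          # fast unit tests only
#   pytest -m "not slow"                    # skip performance tests
#   pytest -m "integration and not external"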
# ================================================================
# TEST DATABASE FIXTURES
# ================================================================
@pytest_asyncio.fixture
async def test_db_session():
"""Create async test database session"""
from app.core.database import database_manager
async with database_manager.async_session_local() as session:
yield session
@pytest_asyncio.fixture
async def training_job_in_db(test_db_session):
    """Create a training job in the database for testing"""
    job = ModelTrainingLog(
        job_id="test-job-123",
        tenant_id="test-tenant",
        status="running",
        progress=50,
        current_step="Training models",
        start_time=datetime.now(),
        config={"include_weather": True},
        created_at=datetime.now(),
        updated_at=datetime.now()
    )
    test_db_session.add(job)
    # AsyncSession methods are coroutines and must be awaited
    await test_db_session.commit()
    await test_db_session.refresh(job)
    return job
@pytest_asyncio.fixture
async def trained_model_in_db(test_db_session):
    """Create a trained model in the database for testing"""
    model = TrainedModel(
        model_id="test-model-123",
        tenant_id="test-tenant",
        product_name="Pan Integral",
        model_type="prophet",
        model_path="/tmp/test_model.pkl",
        version=1,
        training_samples=100,
        features=["temperature", "humidity"],
        hyperparameters={"seasonality_mode": "additive"},
        training_metrics={"mae": 2.5, "mse": 8.3},
        is_active=True,
        created_at=datetime.now()
    )
    test_db_session.add(model)
    await test_db_session.commit()
    await test_db_session.refresh(model)
    return model
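# Illustrative usage (hypothetical async test; the query API assumed here is
# standard SQLAlchemy 2.x):
#
#   async def test_model_lookup(test_db_session, trained_model_in_db):
#       from sqlalchemy import select
#       result = await test_db_session.execute(
#           select(TrainedModel).where(TrainedModel.model_id == "test-model-123")
#       )
#       assert result.scalar_one().product_name == "Pan Integral"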
# ================================================================
# SAMPLE DATA FIXTURES
# ================================================================
@pytest.fixture
def sample_bakery_sales_data():
"""Generate comprehensive bakery sales data for testing"""
# Generate 1 year of data
start_date = datetime(2023, 1, 1)
dates = [start_date + timedelta(days=i) for i in range(365)]
# Spanish bakery products with realistic patterns
products = [
"Pan Integral", "Pan Blanco", "Croissant", "Magdalenas",
"Empanadas", "Tarta Chocolate", "Roscon Reyes", "Palmeras",
"Donuts", "Berlinas", "Napolitanas", "Ensaimadas"
]
# Product-specific configurations
product_config = {
"Pan Integral": {"base": 80, "price": 2.80, "weekend_boost": 1.1, "seasonal": False},
"Pan Blanco": {"base": 120, "price": 2.50, "weekend_boost": 1.2, "seasonal": False},
"Croissant": {"base": 45, "price": 1.50, "weekend_boost": 1.4, "seasonal": False},
"Magdalenas": {"base": 30, "price": 1.20, "weekend_boost": 1.1, "seasonal": False},
"Empanadas": {"base": 25, "price": 3.50, "weekend_boost": 0.9, "seasonal": False},
"Tarta Chocolate": {"base": 15, "price": 18.00, "weekend_boost": 1.6, "seasonal": False},
"Roscon Reyes": {"base": 8, "price": 25.00, "weekend_boost": 1.0, "seasonal": True},
"Palmeras": {"base": 12, "price": 1.80, "weekend_boost": 1.2, "seasonal": False},
"Donuts": {"base": 20, "price": 1.40, "weekend_boost": 1.3, "seasonal": False},
"Berlinas": {"base": 18, "price": 1.60, "weekend_boost": 1.2, "seasonal": False},
"Napolitanas": {"base": 22, "price": 1.70, "weekend_boost": 1.1, "seasonal": False},
"Ensaimadas": {"base": 15, "price": 2.20, "weekend_boost": 1.0, "seasonal": False}
}
data = []
for date in dates:
# Calculate date-specific factors
day_of_year = date.timetuple().tm_yday
is_weekend = date.weekday() >= 5
is_holiday = _is_spanish_holiday(date)
# Madrid weather simulation
temp = 14 + 12 * np.sin((day_of_year / 365) * 2 * np.pi) + np.random.normal(0, 3)
precip = max(0, np.random.exponential(0.8))
for product in products:
config = product_config[product]
# Base quantity
base_qty = config["base"]
# Apply weekend boost
if is_weekend:
base_qty *= config["weekend_boost"]
# Apply holiday boost
if is_holiday:
base_qty *= 1.3
# Seasonal products (like Roscon Reyes for Christmas)
if config["seasonal"] and product == "Roscon Reyes":
if date.month == 12:
                    # Ramp up linearly through December
base_qty *= (1 + (date.day - 1) / 5)
elif date.month == 1 and date.day <= 6:
# High demand until Epiphany (Jan 6)
base_qty *= 3
else:
# Very low demand rest of year
base_qty *= 0.1
# Weather effects
if temp > 30: # Very hot days
if product in ["Pan Integral", "Pan Blanco"]:
base_qty *= 0.7 # Less bread
elif product in ["Donuts", "Berlinas"]:
base_qty *= 0.8 # Less fried items
elif temp < 5: # Cold days
base_qty *= 1.15 # More baked goods
# Add realistic noise and ensure minimum of 1
quantity = max(1, int(base_qty + np.random.normal(0, base_qty * 0.12)))
revenue = round(quantity * config["price"], 2)
data.append({
"date": date.strftime("%Y-%m-%d"),
"product": product,
"quantity": quantity,
"revenue": revenue,
"temperature": round(temp, 1),
"precipitation": round(precip, 2),
"is_weekend": is_weekend,
"is_holiday": is_holiday
})
return pd.DataFrame(data)
@pytest.fixture
def sample_weather_data():
"""Generate realistic Madrid weather data"""
start_date = datetime(2023, 1, 1)
weather_data = []
for i in range(365):
date = start_date + timedelta(days=i)
day_of_year = date.timetuple().tm_yday
# Madrid climate simulation
base_temp = 14 + 12 * np.sin((day_of_year / 365) * 2 * np.pi)
# Seasonal humidity patterns
base_humidity = 50 + 20 * np.sin((day_of_year / 365) * 2 * np.pi + np.pi)
weather_data.append({
"date": date,
"temperature": round(base_temp + np.random.normal(0, 4), 1),
"precipitation": max(0, np.random.exponential(1.2)),
"humidity": np.random.uniform(25, 75),
"wind_speed": np.random.uniform(3, 20),
"pressure": np.random.uniform(995, 1025),
"description": np.random.choice([
"Soleado", "Parcialmente nublado", "Nublado",
"Lluvia ligera", "Despejado", "Variable"
]),
"source": "aemet_test"
})
return weather_data
@pytest.fixture
def sample_traffic_data():
"""Generate realistic Madrid traffic data"""
start_date = datetime(2023, 1, 1)
traffic_data = []
for i in range(365):
date = start_date + timedelta(days=i)
# Generate multiple measurements per day
for hour in range(6, 22, 2): # Every 2 hours from 6 AM to 10 PM
measurement_time = date.replace(hour=hour)
# Madrid traffic patterns
if hour in [7, 8, 9, 18, 19, 20]: # Rush hours
volume = np.random.randint(1200, 2000)
congestion = "high"
speed = np.random.randint(10, 25)
occupation = np.random.randint(60, 90)
elif hour in [12, 13, 14]: # Lunch time
volume = np.random.randint(800, 1200)
congestion = "medium"
speed = np.random.randint(20, 35)
occupation = np.random.randint(40, 70)
else: # Off-peak
volume = np.random.randint(300, 800)
congestion = "low"
speed = np.random.randint(30, 50)
occupation = np.random.randint(15, 50)
# Weekend adjustment
if date.weekday() >= 5:
volume = int(volume * 0.8) # Less traffic on weekends
speed = min(50, int(speed * 1.2)) # Faster speeds
traffic_data.append({
"date": measurement_time,
"traffic_volume": volume,
"occupation_percentage": occupation,
"load_percentage": min(95, occupation + np.random.randint(5, 15)),
"average_speed": speed,
"congestion_level": congestion,
"pedestrian_count": np.random.randint(100, 800),
"measurement_point_id": "MADRID_TEST_001",
"measurement_point_name": "Plaza Mayor",
"road_type": "URB",
"source": "madrid_opendata_test"
})
return traffic_data
# ================================================================
# MOCK SERVICES FIXTURES
# ================================================================
@pytest.fixture
def mock_aemet_client(sample_weather_data):
"""Mock AEMET weather API client"""
with patch('app.external.aemet.AEMETClient') as mock_class:
mock_instance = AsyncMock()
mock_class.return_value = mock_instance
# Configure mock responses
mock_instance.get_historical_weather.return_value = sample_weather_data
mock_instance.get_current_weather.return_value = sample_weather_data[-1]
mock_instance.get_weather_forecast.return_value = sample_weather_data[-7:]
yield mock_instance
@pytest.fixture
def mock_madrid_client(sample_traffic_data):
"""Mock Madrid OpenData API client"""
with patch('app.external.madrid_opendata.MadridOpenDataClient') as mock_class:
mock_instance = AsyncMock()
mock_class.return_value = mock_instance
# Configure mock responses
mock_instance.get_historical_traffic.return_value = sample_traffic_data
mock_instance.get_current_traffic.return_value = sample_traffic_data[-1]
yield mock_instance
@pytest.fixture
def mock_external_services(mock_aemet_client, mock_madrid_client):
"""Combined mock for all external services"""
return {
'aemet': mock_aemet_client,
'madrid': mock_madrid_client
}
# ================================================================
# ML COMPONENT FIXTURES
# ================================================================
@pytest.fixture
def mock_ml_trainer():
"""Mock ML trainer for testing"""
with patch('app.ml.trainer.BakeryMLTrainer') as mock_class:
mock_instance = AsyncMock()
mock_class.return_value = mock_instance
# Configure successful training responses
mock_instance.train_single_product.return_value = {
"status": "completed",
"model_id": "test_model_123",
"metrics": {
"mape": 25.5,
"rmse": 12.3,
"mae": 8.7,
"r2_score": 0.85
},
"training_duration": 45.2,
"data_points_used": 365
}
mock_instance.train_tenant_models.return_value = [
{
"product_name": "Pan Integral",
"model_id": "model_pan_integral_123",
"metrics": {"mape": 22.1, "rmse": 10.5, "mae": 7.8},
"training_completed": True
},
{
"product_name": "Croissant",
"model_id": "model_croissant_456",
"metrics": {"mape": 28.3, "rmse": 8.9, "mae": 6.2},
"training_completed": True
}
]
yield mock_instance
@pytest.fixture
def mock_data_processor():
"""Mock data processor for testing"""
with patch('app.ml.data_processor.BakeryDataProcessor') as mock_class:
mock_instance = AsyncMock()
mock_class.return_value = mock_instance
# Configure mock responses
mock_instance.validate_data_quality.return_value = {
"is_valid": True,
"data_points": 1000,
"missing_percentage": 2.5,
"issues": []
}
mock_instance.prepare_training_data.return_value = pd.DataFrame({
"ds": pd.date_range("2023-01-01", periods=365),
"y": np.random.randint(10, 100, 365),
"temperature": np.random.uniform(0, 35, 365),
"traffic_volume": np.random.randint(100, 2000, 365)
})
yield mock_instance
@pytest.fixture
def mock_data_service():
"""Mock data service for testing"""
mock_service = Mock()
mock_service.get_sales_data = AsyncMock(return_value=[
{"date": "2024-01-01", "product_name": "Pan Integral", "quantity": 45},
{"date": "2024-01-02", "product_name": "Pan Integral", "quantity": 38}
])
mock_service.get_weather_data = AsyncMock(return_value=[
{"date": "2024-01-01", "temperature": 20.5, "humidity": 65}
])
mock_service.get_traffic_data = AsyncMock(return_value=[
{"date": "2024-01-01", "traffic_index": 0.7}
])
return mock_service
@pytest.fixture
def mock_prophet_manager():
"""Mock Prophet manager for testing"""
with patch('app.ml.prophet_manager.BakeryProphetManager') as mock_class:
mock_instance = AsyncMock()
mock_class.return_value = mock_instance
# Configure mock responses
mock_instance.train_model.return_value = {
"model": Mock(), # Mock Prophet model
"metrics": {
"mape": 23.7,
"rmse": 11.2,
"mae": 8.1
},
"cross_validation": {
"cv_mape_mean": 25.1,
"cv_mape_std": 3.2
}
}
mock_instance.generate_predictions.return_value = pd.DataFrame({
"ds": pd.date_range("2024-01-01", periods=30),
"yhat": np.random.uniform(20, 80, 30),
"yhat_lower": np.random.uniform(10, 60, 30),
"yhat_upper": np.random.uniform(30, 100, 30)
})
yield mock_instance
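# Illustrative usage (hypothetical async test; train_model is an AsyncMock,
# so awaiting it returns the configured dictionary):
#
#   async def test_prophet_training_metrics(mock_prophet_manager):
#       result = await mock_prophet_manager.train_model()
#       assert result["metrics"]["mape"] < 30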
# ================================================================
# UTILITY FIXTURES
# ================================================================
@pytest.fixture
def temp_model_storage():
"""Temporary directory for model storage during tests"""
with tempfile.TemporaryDirectory() as temp_dir:
yield Path(temp_dir)
@pytest.fixture
def test_config():
"""Test configuration settings"""
return {
"MODEL_STORAGE_PATH": "/tmp/test_models",
"MAX_TRAINING_TIME_MINUTES": 5,
"MIN_TRAINING_DATA_DAYS": 7,
"PROPHET_SEASONALITY_MODE": "additive",
"INCLUDE_SPANISH_HOLIDAYS": True,
"ENABLE_SYNTHETIC_DATA": True
}
@pytest.fixture
def sample_training_request():
"""Sample training request for API tests"""
return {
"products": ["Pan Integral", "Croissant"],
"include_weather": True,
"include_traffic": True,
"config": {
"seasonality_mode": "additive",
"changepoint_prior_scale": 0.05,
"seasonality_prior_scale": 10.0,
"validation_enabled": True
}
}
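# Illustrative usage (the /training/jobs endpoint is an assumption about the
# service's routes, not part of this conftest):
#
#   def test_submit_training_job(test_client, auth_headers, sample_training_request):
#       response = test_client.post(
#           "/training/jobs", json=sample_training_request, headers=auth_headers
#       )
#       assert response.status_code in (200, 202)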
@pytest.fixture
def sample_single_product_request():
"""Sample single product training request"""
return {
"product_name": "Pan Integral",
"include_weather": True,
"include_traffic": False,
"config": {
"seasonality_mode": "multiplicative",
"include_holidays": True,
"holiday_prior_scale": 15.0
}
}
# ================================================================
# HELPER FUNCTIONS
# ================================================================
def _is_spanish_holiday(date: datetime) -> bool:
"""Check if date is a Spanish holiday"""
spanish_holidays = [
(1, 1), # Año Nuevo
(1, 6), # Reyes Magos
(5, 1), # Día del Trabajo
(8, 15), # Asunción de la Virgen
(10, 12), # Fiesta Nacional de España
(11, 1), # Todos los Santos
(12, 6), # Día de la Constitución
(12, 8), # Inmaculada Concepción
(12, 25), # Navidad
]
return (date.month, date.day) in spanish_holidays
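# Example: Reyes Magos (Jan 6) counts as a holiday, the following day does not:
#
#   assert _is_spanish_holiday(datetime(2023, 1, 6)) is True
#   assert _is_spanish_holiday(datetime(2023, 1, 7)) is False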
@pytest.fixture
def spanish_holidays_2023():
"""List of Spanish holidays for 2023"""
holidays = []
for month, day in [
(1, 1), (1, 6), (5, 1), (8, 15), (10, 12),
(11, 1), (12, 6), (12, 8), (12, 25)
]:
holidays.append(datetime(2023, month, day))
return holidays
# ================================================================
# PERFORMANCE TESTING FIXTURES
# ================================================================
@pytest.fixture
def large_dataset_for_performance():
"""Generate large dataset for performance testing"""
# Generate 2 years of data with 15 products
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 1, 1)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
products = [
"Pan Integral", "Pan Blanco", "Croissant", "Magdalenas",
"Empanadas", "Tarta Chocolate", "Roscon Reyes", "Palmeras",
"Donuts", "Berlinas", "Napolitanas", "Ensaimadas",
"Baguette", "Pan de Molde", "Bizcocho"
]
data = []
for date in date_range:
for product in products:
# Realistic sales with patterns
base_quantity = np.random.randint(5, 150)
# Seasonal patterns
if date.month in [12, 1]: # Winter/Holiday season
base_quantity *= 1.4
elif date.month in [6, 7, 8]: # Summer
base_quantity *= 0.8
# Weekly patterns
if date.weekday() >= 5: # Weekends
base_quantity *= 1.2
elif date.weekday() == 0: # Monday
base_quantity *= 0.7
# Add noise
quantity = max(1, int(base_quantity + np.random.normal(0, base_quantity * 0.1)))
data.append({
"date": date.strftime("%Y-%m-%d"),
"product": product,
"quantity": quantity,
"revenue": round(quantity * np.random.uniform(1.5, 8.0), 2),
"temperature": round(15 + 12 * np.sin((date.timetuple().tm_yday / 365) * 2 * np.pi) + np.random.normal(0, 3), 1),
"precipitation": max(0, np.random.exponential(0.8)),
"is_weekend": date.weekday() >= 5,
"is_holiday": _is_spanish_holiday(date)
})
return pd.DataFrame(data)
@pytest.fixture
def memory_monitor():
"""Memory monitoring utility for performance tests"""
import psutil
import gc
class MemoryMonitor:
def __init__(self):
self.process = psutil.Process()
self.snapshots = []
def snapshot(self, label: str):
gc.collect() # Force garbage collection
memory_mb = self.process.memory_info().rss / 1024 / 1024
self.snapshots.append({
'label': label,
'memory_mb': memory_mb,
'timestamp': datetime.now()
})
return memory_mb
def get_peak_usage(self):
if not self.snapshots:
return 0
return max(s['memory_mb'] for s in self.snapshots)
def get_usage_increase(self):
if len(self.snapshots) < 2:
return 0
return self.snapshots[-1]['memory_mb'] - self.snapshots[0]['memory_mb']
def report(self):
print("\n=== Memory Usage Report ===")
for snapshot in self.snapshots:
print(f"{snapshot['label']}: {snapshot['memory_mb']:.2f} MB")
print(f"Peak Usage: {self.get_peak_usage():.2f} MB")
print(f"Total Increase: {self.get_usage_increase():.2f} MB")
return MemoryMonitor()
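# Illustrative usage (hypothetical performance test; the 500 MB budget is an
# arbitrary example threshold, not a project requirement):
#
#   def test_memory_budget(memory_monitor, large_dataset_for_performance):
#       memory_monitor.snapshot("before")
#       totals = large_dataset_for_performance.groupby("product")["quantity"].sum()
#       memory_monitor.snapshot("after")
#       memory_monitor.report()
#       assert memory_monitor.get_usage_increase() < 500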
@pytest.fixture
def timing_monitor():
"""Timing monitoring utility for performance tests"""
import time
class TimingMonitor:
def __init__(self):
self.timings = []
self.start_time = None
def start(self, label: str):
self.start_time = time.time()
self.current_label = label
def stop(self):
if self.start_time is None:
return 0
duration = time.time() - self.start_time
self.timings.append({
'label': self.current_label,
'duration': duration
})
self.start_time = None
return duration
def get_total_time(self):
return sum(t['duration'] for t in self.timings)
def report(self):
print("\n=== Timing Report ===")
for timing in self.timings:
print(f"{timing['label']}: {timing['duration']:.2f}s")
print(f"Total Time: {self.get_total_time():.2f}s")
return TimingMonitor()
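# Illustrative usage (hypothetical test; the 120 s limit mirrors the
# single_product_training budget in the performance_benchmarks fixture below):
#
#   def test_training_time_budget(timing_monitor):
#       timing_monitor.start("single_product_training")
#       ...  # run training here
#       duration = timing_monitor.stop()
#       assert duration < 120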
# ================================================================
# ADDITIONAL FIXTURES FOR COMPREHENSIVE TESTING
# ================================================================
@pytest.fixture
def mock_job_scheduler():
"""Mock job scheduler for testing"""
with patch('app.services.job_scheduler.JobScheduler') as mock_scheduler:
mock_instance = Mock()
mock_scheduler.return_value = mock_instance
mock_instance.schedule_job.return_value = "scheduled_job_123"
mock_instance.cancel_job.return_value = True
mock_instance.get_job_status.return_value = "running"
yield mock_instance
@pytest.fixture
def sample_model_metadata():
"""Sample model metadata for testing"""
return {
"model_id": "test_model_123",
"tenant_id": "test_tenant",
"product_name": "Pan Integral",
"model_type": "prophet",
"training_date": datetime.now().isoformat(),
"data_points_used": 365,
"features_used": ["temperature", "is_weekend", "is_holiday"],
"metrics": {
"mape": 23.5,
"rmse": 12.3,
"mae": 8.7,
"r2_score": 0.85
},
"hyperparameters": {
"seasonality_mode": "additive",
"changepoint_prior_scale": 0.05,
"seasonality_prior_scale": 10.0
},
"version": "1.0",
"status": "active"
}
@pytest.fixture
def training_progress_states():
"""Different training progress states for testing"""
return [
{"status": "pending", "progress": 0, "current_step": "Initializing training job"},
{"status": "running", "progress": 10, "current_step": "Fetching sales data"},
{"status": "running", "progress": 25, "current_step": "Processing weather data"},
{"status": "running", "progress": 40, "current_step": "Processing traffic data"},
{"status": "running", "progress": 55, "current_step": "Engineering features"},
{"status": "running", "progress": 70, "current_step": "Training Pan Integral model"},
{"status": "running", "progress": 85, "current_step": "Validating model performance"},
{"status": "running", "progress": 95, "current_step": "Saving model artifacts"},
{"status": "completed", "progress": 100, "current_step": "Training completed successfully"}
]
@pytest.fixture
def error_scenarios():
"""Different error scenarios for testing"""
return {
"insufficient_data": {
"error_type": "DataError",
"error_message": "Insufficient training data: only 15 days available, minimum 30 required",
"error_code": "INSUFFICIENT_DATA"
},
"external_api_failure": {
"error_type": "ExternalAPIError",
"error_message": "Failed to fetch weather data from AEMET API",
"error_code": "WEATHER_API_ERROR"
},
"model_training_failure": {
"error_type": "ModelTrainingError",
"error_message": "Prophet model training failed: unable to fit data",
"error_code": "MODEL_TRAINING_FAILED"
},
"data_quality_error": {
"error_type": "DataQualityError",
"error_message": "Data quality issues detected: 45% missing values in quantity column",
"error_code": "DATA_QUALITY_POOR"
}
}
@pytest.fixture
def performance_benchmarks():
"""Performance benchmarks for testing"""
return {
"single_product_training": {
"max_duration_seconds": 120,
"max_memory_mb": 500,
"min_accuracy_mape": 50
},
"multi_product_training": {
"max_duration_seconds": 300,
"max_memory_mb": 1000,
"min_accuracy_mape": 55
},
"data_processing": {
"max_throughput_rows_per_second": 1000,
"max_memory_per_1k_rows_mb": 10
},
"concurrent_jobs": {
"max_concurrent_jobs": 5,
"max_queue_time_seconds": 30
}
}
@pytest.fixture
def mock_model_storage():
"""Mock model storage system for testing"""
storage = {}
class MockModelStorage:
def save_model(self, model_id: str, model_data: Any, metadata: Dict[str, Any]):
storage[model_id] = {
"model_data": model_data,
"metadata": metadata,
"saved_at": datetime.now()
}
return f"/models/{model_id}.pkl"
def load_model(self, model_id: str):
if model_id in storage:
return storage[model_id]["model_data"]
raise FileNotFoundError(f"Model {model_id} not found")
def get_metadata(self, model_id: str):
if model_id in storage:
return storage[model_id]["metadata"]
raise FileNotFoundError(f"Model {model_id} not found")
def delete_model(self, model_id: str):
if model_id in storage:
del storage[model_id]
return True
return False
def list_models(self, tenant_id: str = None):
models = []
for model_id, data in storage.items():
if tenant_id is None or data["metadata"].get("tenant_id") == tenant_id:
models.append({
"model_id": model_id,
"metadata": data["metadata"],
"saved_at": data["saved_at"]
})
return models
return MockModelStorage()
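# Illustrative usage (hypothetical test exercising the in-memory storage):
#
#   def test_model_roundtrip(mock_model_storage, sample_model_metadata):
#       path = mock_model_storage.save_model("m1", b"model-bytes", sample_model_metadata)
#       assert path == "/models/m1.pkl"
#       assert mock_model_storage.load_model("m1") == b"model-bytes"
#       assert mock_model_storage.delete_model("m1") is True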
@pytest.fixture
def real_world_scenarios():
"""Real-world bakery scenarios for testing"""
return {
"holiday_rush": {
"description": "Christmas season with high demand for seasonal products",
"date_range": ("2023-12-15", "2023-12-31"),
"expected_patterns": {
"Roscon Reyes": {"multiplier": 5.0, "trend": "increasing"},
"Pan Integral": {"multiplier": 1.3, "trend": "stable"},
"Tarta Chocolate": {"multiplier": 2.0, "trend": "increasing"}
}
},
"summer_slowdown": {
"description": "Summer period with generally lower sales",
"date_range": ("2023-07-01", "2023-08-31"),
"expected_patterns": {
"Pan Integral": {"multiplier": 0.8, "trend": "decreasing"},
"Croissant": {"multiplier": 0.9, "trend": "stable"},
"Cold_drinks": {"multiplier": 1.5, "trend": "increasing"}
}
},
"weekend_patterns": {
"description": "Weekend shopping patterns",
"expected_patterns": {
"weekend_boost": 1.2,
"peak_hours": ["10:00", "11:00", "18:00", "19:00"],
"popular_products": ["Croissant", "Palmeras", "Tarta Chocolate"]
}
},
"weather_impact": {
"description": "Weather impact on sales",
"scenarios": {
"rainy_day": {"bread_sales": 1.1, "pastry_sales": 0.9},
"hot_day": {"bread_sales": 0.8, "cold_items": 1.3},
"cold_day": {"bread_sales": 1.2, "hot_items": 1.4}
}
}
}
@pytest.fixture
def data_quality_test_cases():
"""Various data quality test cases"""
return {
"missing_values": {
"quantity_missing_5pct": 0.05,
"quantity_missing_20pct": 0.20,
"quantity_missing_50pct": 0.50,
"revenue_missing_10pct": 0.10
},
"outliers": {
"extreme_high": 100, # 100x normal values
"extreme_low": 0.01, # Near-zero values
"negative_values": -1,
"outlier_percentage": 0.01
},
"inconsistencies": {
"future_dates": ["2025-12-31", "2026-01-01"],
"invalid_dates": ["2023-13-01", "2023-02-30"],
"mismatched_revenue": True, # Revenue doesn't match quantity * price
"duplicate_records": True
},
"insufficient_data": {
"too_few_days": 10,
"too_few_products": 1,
"sporadic_data": 0.3 # Only 30% of expected data points
}
}
@pytest.fixture
def api_test_scenarios():
"""API testing scenarios"""
return {
"authentication": {
"valid_token": "Bearer valid_test_token_123",
"invalid_token": "Bearer invalid_token",
"expired_token": "Bearer expired_token_456",
"missing_token": None
},
"request_validation": {
"valid_request": {
"products": ["Pan Integral"],
"include_weather": True,
"include_traffic": True,
"config": {"seasonality_mode": "additive"}
},
"invalid_products": {
"products": [], # Empty products list
"include_weather": True
},
"invalid_config": {
"products": ["Pan Integral"],
"config": {"seasonality_mode": "invalid_mode"}
},
"missing_required_fields": {
"include_weather": True # Missing products
}
},
"rate_limiting": {
"max_requests_per_minute": 60,
"burst_requests": 100
}
}
@pytest.fixture
def integration_test_dependencies():
"""Dependencies for integration testing"""
class IntegrationDependencies:
def __init__(self):
self.external_services = {}
self.databases = {}
self.message_queues = {}
self.storage_systems = {}
def register_external_service(self, name: str, mock_instance):
self.external_services[name] = mock_instance
def register_database(self, name: str, mock_session):
self.databases[name] = mock_session
def register_message_queue(self, name: str, mock_queue):
self.message_queues[name] = mock_queue
def register_storage(self, name: str, mock_storage):
self.storage_systems[name] = mock_storage
def get_service(self, name: str):
return self.external_services.get(name)
def get_database(self, name: str):
return self.databases.get(name)
def are_all_services_healthy(self):
# Mock health check for all registered services
return len(self.external_services) > 0
return IntegrationDependencies()
@pytest.fixture
def load_test_configuration():
"""Configuration for load testing"""
return {
"concurrent_users": {
"light_load": 5,
"medium_load": 15,
"heavy_load": 30,
"stress_load": 50
},
"test_duration": {
"quick_test": 60, # 1 minute
"standard_test": 300, # 5 minutes
"extended_test": 900 # 15 minutes
},
"request_patterns": {
"constant_rate": "steady",
"ramp_up": "increasing",
"spike": "burst",
"random": "variable"
},
"success_criteria": {
"min_success_rate": 0.95,
"max_response_time": 30.0, # seconds
"max_error_rate": 0.05
}
}
@pytest.fixture
def mock_notification_system():
"""Mock notification system for testing"""
notifications_sent = []
class MockNotificationSystem:
def send_training_started(self, tenant_id: str, job_id: str, products: List[str]):
notification = {
"type": "training_started",
"tenant_id": tenant_id,
"job_id": job_id,
"products": products,
"timestamp": datetime.now()
}
notifications_sent.append(notification)
return notification
def send_training_completed(self, tenant_id: str, job_id: str, results: Dict[str, Any]):
notification = {
"type": "training_completed",
"tenant_id": tenant_id,
"job_id": job_id,
"results": results,
"timestamp": datetime.now()
}
notifications_sent.append(notification)
return notification
def send_training_failed(self, tenant_id: str, job_id: str, error: str):
notification = {
"type": "training_failed",
"tenant_id": tenant_id,
"job_id": job_id,
"error": error,
"timestamp": datetime.now()
}
notifications_sent.append(notification)
return notification
def get_notifications(self, tenant_id: str = None):
if tenant_id:
return [n for n in notifications_sent if n["tenant_id"] == tenant_id]
return notifications_sent
def clear_notifications(self):
notifications_sent.clear()
return MockNotificationSystem()
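# Illustrative usage (hypothetical test):
#
#   def test_failure_notification(mock_notification_system):
#       mock_notification_system.send_training_failed("t1", "job-9", "fit diverged")
#       sent = mock_notification_system.get_notifications("t1")
#       assert sent[-1]["type"] == "training_failed"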
@pytest.fixture
def test_metrics_collector():
"""Test metrics collector for monitoring test performance"""
metrics = {}
class TestMetricsCollector:
def __init__(self):
self.start_times = {}
self.counters = {}
self.gauges = {}
self.histograms = {}
def start_timer(self, metric_name: str):
self.start_times[metric_name] = time.time()
def end_timer(self, metric_name: str):
if metric_name in self.start_times:
duration = time.time() - self.start_times[metric_name]
if metric_name not in self.histograms:
self.histograms[metric_name] = []
self.histograms[metric_name].append(duration)
del self.start_times[metric_name]
return duration
return 0
def increment_counter(self, counter_name: str, value: int = 1):
self.counters[counter_name] = self.counters.get(counter_name, 0) + value
def set_gauge(self, gauge_name: str, value: float):
self.gauges[gauge_name] = value
def get_counter(self, counter_name: str):
return self.counters.get(counter_name, 0)
def get_gauge(self, gauge_name: str):
return self.gauges.get(gauge_name, 0)
def get_histogram_stats(self, histogram_name: str):
if histogram_name not in self.histograms:
return {}
values = self.histograms[histogram_name]
return {
"count": len(values),
"min": min(values) if values else 0,
"max": max(values) if values else 0,
"avg": sum(values) / len(values) if values else 0,
"p50": sorted(values)[len(values)//2] if values else 0,
"p95": sorted(values)[int(len(values)*0.95)] if values else 0,
"p99": sorted(values)[int(len(values)*0.99)] if values else 0
}
def get_all_metrics(self):
return {
"counters": self.counters,
"gauges": self.gauges,
"histograms": {name: self.get_histogram_stats(name) for name in self.histograms}
}
def reset(self):
self.start_times.clear()
self.counters.clear()
self.gauges.clear()
self.histograms.clear()
return TestMetricsCollector()
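# Illustrative usage (hypothetical test):
#
#   def test_collects_timing_histogram(test_metrics_collector):
#       test_metrics_collector.start_timer("fit")
#       ...  # timed work here
#       test_metrics_collector.end_timer("fit")
#       stats = test_metrics_collector.get_histogram_stats("fit")
#       assert stats["count"] == 1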
# ================================================================
# PYTEST PLUGINS AND HOOKS
# ================================================================
def pytest_runtest_setup(item):
"""Setup before each test"""
# Add any pre-test setup logic here
pass
def pytest_runtest_teardown(item, nextitem):
"""Teardown after each test"""
# Add any post-test cleanup logic here
import gc
gc.collect() # Force garbage collection after each test
def pytest_sessionstart(session):
"""Called after the Session object has been created"""
print("\n" + "="*80)
print("TRAINING SERVICE TEST SESSION STARTING")
print("="*80)
def pytest_sessionfinish(session, exitstatus):
"""Called after whole test run finished"""
print("\n" + "="*80)
print("TRAINING SERVICE TEST SESSION FINISHED")
print(f"Exit Status: {exitstatus}")
print("="*80)
# ================================================================
# FINAL CONFIGURATION
# ================================================================
# Silence numpy floating-point warnings during tests (numpy and pandas are
# already imported at the top of this module)
np.seterr(all='ignore')
# Configure pandas display options for test output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)
# Set random seeds for reproducible tests
np.random.seed(42)
import random
random.seed(42)