Files
bakery-ia/services/training/tests/test_ml.py

513 lines
19 KiB
Python
Raw Normal View History

2025-07-19 16:59:37 +02:00
# services/training/tests/test_ml.py
"""
Tests for ML components: trainer, prophet_manager, and data_processor
"""
import pytest
import pandas as pd
import numpy as np
from unittest.mock import Mock, patch, AsyncMock
from datetime import datetime, timedelta
import os
import tempfile
from app.ml.trainer import BakeryMLTrainer
from app.ml.prophet_manager import BakeryProphetManager
from app.ml.data_processor import BakeryDataProcessor
class TestBakeryDataProcessor:
    """Tests for ``BakeryDataProcessor``.

    Covers training-frame preparation, prediction-feature preparation,
    temporal feature engineering and Spanish holiday detection.
    """

    @pytest.fixture
    def data_processor(self):
        """Return a fresh ``BakeryDataProcessor`` for each test."""
        return BakeryDataProcessor()

    @pytest.fixture
    def sample_sales_data(self):
        """Create 60 days of sales for one product (45 units, noise in -10..10)."""
        # Seeded generator: the fixture yields identical data on every run,
        # so a failure can be reproduced.
        rng = np.random.default_rng(42)
        dates = pd.date_range('2024-01-01', periods=60, freq='D')
        return pd.DataFrame({
            'date': dates,
            'product_name': ['Pan Integral'] * 60,
            # The upper bound is exclusive, hence 11 for an inclusive +10.
            'quantity': 45 + rng.integers(-10, 11, size=60),
        })

    @pytest.fixture
    def sample_weather_data(self):
        """Create 60 days of synthetic temperature, precipitation and humidity."""
        rng = np.random.default_rng(43)
        dates = pd.date_range('2024-01-01', periods=60, freq='D')
        day_index = np.arange(60)
        return pd.DataFrame({
            'date': dates,
            # Slow yearly sine wave around 15 plus Gaussian noise.
            'temperature': (
                15
                + 5 * np.sin(2 * np.pi * day_index / 365)
                + rng.normal(0, 2, size=60)
            ),
            'precipitation': np.maximum(0, rng.exponential(1, size=60)),
            'humidity': 60 + rng.normal(0, 10, size=60),
        })

    @pytest.fixture
    def sample_traffic_data(self):
        """Create 60 days of synthetic traffic volume around 100."""
        rng = np.random.default_rng(44)
        dates = pd.date_range('2024-01-01', periods=60, freq='D')
        return pd.DataFrame({
            'date': dates,
            'traffic_volume': 100 + rng.normal(0, 20, size=60),
        })

    @pytest.mark.asyncio
    async def test_prepare_training_data_basic(
        self,
        data_processor,
        sample_sales_data,
        sample_weather_data,
        sample_traffic_data
    ):
        """Test basic data preparation with sales, weather and traffic inputs."""
        result = await data_processor.prepare_training_data(
            sales_data=sample_sales_data,
            weather_data=sample_weather_data,
            traffic_data=sample_traffic_data,
            product_name="Pan Integral"
        )

        # Check result structure
        assert isinstance(result, pd.DataFrame)
        assert 'ds' in result.columns
        assert 'y' in result.columns
        assert len(result) > 0

        # Check Prophet format: a timezone-naive datetime column and a
        # numeric target. The check does not pin the datetime resolution,
        # so it does not depend on the pandas default unit.
        assert pd.api.types.is_datetime64_dtype(result['ds'])
        assert pd.api.types.is_numeric_dtype(result['y'])

        # Check temporal, weather and traffic features; the set difference
        # names the missing columns in the failure message.
        expected_features = {
            'day_of_week', 'is_weekend', 'month', 'is_holiday',
            'temperature', 'precipitation', 'humidity',
            'traffic_volume',
        }
        missing = expected_features - set(result.columns)
        assert not missing, f"missing feature columns: {sorted(missing)}"

    @pytest.mark.asyncio
    async def test_prepare_training_data_empty_weather(
        self,
        data_processor,
        sample_sales_data
    ):
        """Test data preparation with empty weather and traffic frames."""
        result = await data_processor.prepare_training_data(
            sales_data=sample_sales_data,
            weather_data=pd.DataFrame(),
            traffic_data=pd.DataFrame(),
            product_name="Pan Integral"
        )

        # Should still work with default values
        assert isinstance(result, pd.DataFrame)
        assert 'ds' in result.columns
        assert 'y' in result.columns

        # Should have default weather values
        assert 'temperature' in result.columns
        assert result['temperature'].iloc[0] == 15.0  # Default value

    @pytest.mark.asyncio
    async def test_prepare_prediction_features(self, data_processor):
        """Test preparation of prediction features for a 7-day horizon."""
        future_dates = pd.date_range('2024-02-01', periods=7, freq='D')
        weather_forecast = pd.DataFrame({
            'ds': future_dates,
            'temperature': [18.0] * 7,
            'precipitation': [0.0] * 7,
            'humidity': [65.0] * 7
        })

        result = await data_processor.prepare_prediction_features(
            future_dates=future_dates,
            weather_forecast=weather_forecast,
            traffic_forecast=pd.DataFrame()
        )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 7
        assert 'ds' in result.columns

        # Check temporal features are added
        assert 'day_of_week' in result.columns
        assert 'is_weekend' in result.columns

        # Check weather features are carried through unchanged
        assert 'temperature' in result.columns
        assert (result['temperature'] == 18.0).all()

    def test_add_temporal_features(self, data_processor):
        """Test temporal feature engineering on a 10-day date frame."""
        dates = pd.date_range('2024-01-01', periods=10, freq='D')
        df = pd.DataFrame({'date': dates})

        result = data_processor._add_temporal_features(df)

        # Check temporal features
        expected_columns = {
            'day_of_week', 'is_weekend', 'month', 'season',
            'week_of_year', 'quarter', 'is_holiday', 'is_school_holiday',
        }
        missing = expected_columns - set(result.columns)
        assert not missing, f"missing temporal columns: {sorted(missing)}"

        # Check weekend detection
        # 2024-01-01 was a Monday (day_of_week = 0)
        monday = result.iloc[0]
        assert monday['day_of_week'] == 0
        assert monday['is_weekend'] == 0

        # 2024-01-06 was a Saturday (day_of_week = 5)
        saturday = result.iloc[5]
        assert saturday['day_of_week'] == 5
        assert saturday['is_weekend'] == 1

    def test_spanish_holiday_detection(self, data_processor):
        """Test Spanish holiday detection for known holidays and a regular day."""
        known_holidays = {
            'new_year': datetime(2024, 1, 1),
            'epiphany': datetime(2024, 1, 6),
            'labour_day': datetime(2024, 5, 1),
            'christmas': datetime(2024, 12, 25),
        }
        for name, day in known_holidays.items():
            assert data_processor._is_spanish_holiday(day), (
                f"{name} ({day:%Y-%m-%d}) should be a holiday"
            )

        # Test non-holiday
        regular_day = datetime(2024, 3, 15)
        assert not data_processor._is_spanish_holiday(regular_day)

    @pytest.mark.asyncio
    async def test_prepare_training_data_insufficient_data(self, data_processor):
        """Test handling of insufficient training data (only 5 rows)."""
        # Create very small dataset
        small_sales_data = pd.DataFrame({
            'date': pd.date_range('2024-01-01', periods=5, freq='D'),
            'product_name': ['Pan Integral'] * 5,
            'quantity': [45, 50, 48, 52, 49]
        })

        # NOTE(review): Exception is deliberately broad because the concrete
        # error type raised by the processor is not visible from this file;
        # narrow it once confirmed.
        with pytest.raises(Exception):
            await data_processor.prepare_training_data(
                sales_data=small_sales_data,
                weather_data=pd.DataFrame(),
                traffic_data=pd.DataFrame(),
                product_name="Pan Integral"
            )
class TestBakeryProphetManager:
    """Tests for ``BakeryProphetManager``.

    Covers training, input validation, holiday table creation, regressor
    extraction and forecast generation.
    """

    @pytest.fixture
    def prophet_manager(self, tmp_path):
        """Yield a manager whose model storage path points at a temp directory.

        The per-test ``tmp_path`` avoids a shared, never-cleaned directory
        under ``/tmp``. The fixture yields from inside the ``patch`` context
        so the patched setting stays active for the whole test, not only
        while the manager is being constructed.
        """
        storage_dir = tmp_path / 'test_models'
        storage_dir.mkdir()
        with patch(
            'app.ml.prophet_manager.settings.MODEL_STORAGE_PATH',
            str(storage_dir),
        ):
            yield BakeryProphetManager()

    @pytest.fixture
    def sample_prophet_data(self):
        """Create 100 days of sample data in Prophet format (``ds``/``y`` plus regressors)."""
        # Seeded generator keeps the fixture reproducible between runs.
        rng = np.random.default_rng(42)
        dates = pd.date_range('2024-01-01', periods=100, freq='D')
        day_index = np.arange(100)
        return pd.DataFrame({
            'ds': dates,
            # Weekly sine pattern around 45 plus Gaussian noise.
            'y': (
                45
                + 10 * np.sin(2 * np.pi * day_index / 7)
                + rng.normal(0, 5, size=100)
            ),
            'temperature': 15 + 5 * np.sin(2 * np.pi * day_index / 365),
            'humidity': 60 + rng.normal(0, 10, size=100),
        })

    @pytest.mark.asyncio
    async def test_train_bakery_model_success(self, prophet_manager, sample_prophet_data):
        """Test successful model training with Prophet and joblib mocked out."""
        # NOTE(review): this patches the name on the ``prophet`` package. If
        # app.ml.prophet_manager binds it via ``from prophet import Prophet``,
        # the target would need to be 'app.ml.prophet_manager.Prophet' --
        # confirm against that module.
        with patch('prophet.Prophet') as mock_prophet_class, \
                patch('joblib.dump') as mock_dump:
            mock_model = Mock()
            mock_model.fit.return_value = None
            mock_prophet_class.return_value = mock_model

            result = await prophet_manager.train_bakery_model(
                tenant_id="test-tenant",
                product_name="Pan Integral",
                df=sample_prophet_data,
                job_id="test-job-123"
            )

        # Check result structure
        assert isinstance(result, dict)
        expected_keys = {
            'model_id', 'model_path', 'type',
            'training_samples', 'features', 'training_metrics',
        }
        missing = expected_keys - set(result)
        assert not missing, f"missing result keys: {sorted(missing)}"
        assert result['type'] == 'prophet'

        # Check that model was fitted and persisted exactly once
        mock_model.fit.assert_called_once()
        mock_dump.assert_called_once()

    @pytest.mark.asyncio
    async def test_validate_training_data_valid(self, prophet_manager, sample_prophet_data):
        """Test validation with valid data."""
        # Should not raise exception
        await prophet_manager._validate_training_data(sample_prophet_data, "Pan Integral")

    @pytest.mark.asyncio
    async def test_validate_training_data_insufficient(self, prophet_manager):
        """Test validation with insufficient data (5 rows)."""
        small_data = pd.DataFrame({
            'ds': pd.date_range('2024-01-01', periods=5, freq='D'),
            'y': [45, 50, 48, 52, 49]
        })

        with pytest.raises(ValueError, match="Insufficient training data"):
            await prophet_manager._validate_training_data(small_data, "Pan Integral")

    @pytest.mark.asyncio
    async def test_validate_training_data_missing_columns(self, prophet_manager):
        """Test validation with missing required columns (no ``ds``/``y``)."""
        invalid_data = pd.DataFrame({
            'date': pd.date_range('2024-01-01', periods=50, freq='D'),
            'quantity': [45] * 50
        })

        with pytest.raises(ValueError, match="Missing required columns"):
            await prophet_manager._validate_training_data(invalid_data, "Pan Integral")

    def test_get_spanish_holidays(self, prophet_manager):
        """Test Spanish holidays creation."""
        holidays = prophet_manager._get_spanish_holidays()

        # An empty frame is tolerated and passes without further checks.
        if not holidays.empty:
            assert 'holiday' in holidays.columns
            assert 'ds' in holidays.columns

            # Check some known holidays exist
            holiday_names = set(holidays['holiday'].unique())
            expected_holidays = {'new_year', 'christmas', 'may_day'}
            missing = expected_holidays - holiday_names
            assert not missing, f"missing holidays: {sorted(missing)}"

    def test_extract_regressor_columns(self, prophet_manager, sample_prophet_data):
        """Test regressor column extraction."""
        regressors = prophet_manager._extract_regressor_columns(sample_prophet_data)

        assert isinstance(regressors, list)
        assert 'temperature' in regressors
        assert 'humidity' in regressors
        assert 'ds' not in regressors  # Should be excluded
        assert 'y' not in regressors  # Should be excluded

    @pytest.mark.asyncio
    async def test_generate_forecast(self, prophet_manager, tmp_path):
        """Test forecast generation from a (mocked) saved model."""
        # Create an empty placeholder model file. pytest owns tmp_path, so
        # no manual cleanup is needed.
        model_file = tmp_path / 'model.pkl'
        model_file.touch()

        forecast_dates = pd.date_range('2024-02-01', periods=7, freq='D')

        # Mock a saved model
        with patch('joblib.load') as mock_load:
            mock_model = Mock()
            mock_model.predict.return_value = pd.DataFrame({
                'ds': forecast_dates,
                'yhat': [50.0] * 7,
                'yhat_lower': [45.0] * 7,
                'yhat_upper': [55.0] * 7
            })
            mock_load.return_value = mock_model

            future_data = pd.DataFrame({
                'ds': forecast_dates,
                'temperature': [18.0] * 7,
                'humidity': [65.0] * 7
            })

            result = await prophet_manager.generate_forecast(
                model_path=str(model_file),
                future_dates=future_data,
                regressor_columns=['temperature', 'humidity']
            )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 7
        mock_model.predict.assert_called_once()
class TestBakeryMLTrainer:
    """Tests for ``BakeryMLTrainer``.

    Covers tenant-wide training, single-product training, input validation
    and summary calculation.
    """

    @pytest.fixture
    def ml_trainer(self, mock_prophet_manager, mock_data_processor):
        """Return a trainer built while the mock fixtures are active.

        NOTE(review): ``mock_prophet_manager`` and ``mock_data_processor``
        are not defined in this file; presumably they come from a
        ``conftest.py`` -- verify.
        """
        return BakeryMLTrainer()

    @pytest.fixture
    def sample_sales_data(self):
        """Sample sales records for training: two products over five days."""
        return [
            {"date": "2024-01-01", "product_name": "Pan Integral", "quantity": 45},
            {"date": "2024-01-02", "product_name": "Pan Integral", "quantity": 50},
            {"date": "2024-01-03", "product_name": "Pan Integral", "quantity": 48},
            {"date": "2024-01-04", "product_name": "Croissant", "quantity": 25},
            {"date": "2024-01-05", "product_name": "Croissant", "quantity": 30}
        ]

    @pytest.mark.asyncio
    async def test_train_tenant_models_success(
        self,
        ml_trainer,
        sample_sales_data,
        mock_prophet_manager,
        mock_data_processor
    ):
        """Test successful training of tenant models."""
        result = await ml_trainer.train_tenant_models(
            tenant_id="test-tenant",
            sales_data=sample_sales_data,
            weather_data=[],
            traffic_data=[],
            job_id="test-job-123"
        )

        # Check result structure; the set difference names any missing key.
        assert isinstance(result, dict)
        expected_keys = {
            'job_id', 'tenant_id', 'status', 'training_results', 'summary',
        }
        missing = expected_keys - set(result)
        assert not missing, f"missing result keys: {sorted(missing)}"

        assert result['status'] == 'completed'
        assert result['tenant_id'] == 'test-tenant'

    @pytest.mark.asyncio
    async def test_train_single_product_success(
        self,
        ml_trainer,
        sample_sales_data,
        mock_prophet_manager,
        mock_data_processor
    ):
        """Test successful single product training."""
        product_sales = [
            item for item in sample_sales_data
            if item['product_name'] == 'Pan Integral'
        ]

        result = await ml_trainer.train_single_product(
            tenant_id="test-tenant",
            product_name="Pan Integral",
            sales_data=product_sales,
            weather_data=[],
            traffic_data=[],
            job_id="test-job-123"
        )

        # Check result structure
        assert isinstance(result, dict)
        expected_keys = {
            'job_id', 'tenant_id', 'product_name', 'status', 'model_info',
        }
        missing = expected_keys - set(result)
        assert not missing, f"missing result keys: {sorted(missing)}"

        assert result['status'] == 'success'
        assert result['product_name'] == 'Pan Integral'

    @pytest.mark.asyncio
    async def test_train_single_product_no_data(self, ml_trainer):
        """Test single product training with no data."""
        with pytest.raises(ValueError, match="No sales data found"):
            await ml_trainer.train_single_product(
                tenant_id="test-tenant",
                product_name="Nonexistent Product",
                sales_data=[],
                weather_data=[],
                traffic_data=[],
                job_id="test-job-123"
            )

    @pytest.mark.asyncio
    async def test_validate_input_data_valid(self, ml_trainer, sample_sales_data):
        """Test input data validation with valid data."""
        df = pd.DataFrame(sample_sales_data)

        # Should not raise exception
        await ml_trainer._validate_input_data(df, "test-tenant")

    @pytest.mark.asyncio
    async def test_validate_input_data_empty(self, ml_trainer):
        """Test input data validation with empty data."""
        empty_df = pd.DataFrame()

        with pytest.raises(ValueError, match="No sales data provided"):
            await ml_trainer._validate_input_data(empty_df, "test-tenant")

    @pytest.mark.asyncio
    async def test_validate_input_data_missing_columns(self, ml_trainer):
        """Test input data validation with missing columns."""
        invalid_df = pd.DataFrame([
            {"invalid_column": "value1"},
            {"invalid_column": "value2"}
        ])

        with pytest.raises(ValueError, match="Missing required columns"):
            await ml_trainer._validate_input_data(invalid_df, "test-tenant")

    def test_calculate_training_summary(self, ml_trainer):
        """Test training summary calculation for one success, one error, one skip."""
        training_results = {
            "Pan Integral": {
                "status": "success",
                "model_info": {"training_metrics": {"mae": 5.0, "rmse": 7.0}}
            },
            "Croissant": {
                "status": "error",
                "error_message": "Insufficient data"
            },
            "Baguette": {
                "status": "skipped",
                "reason": "insufficient_data"
            }
        }

        summary = ml_trainer._calculate_training_summary(training_results)

        assert summary['total_products'] == 3
        assert summary['successful_products'] == 1
        assert summary['failed_products'] == 1
        assert summary['skipped_products'] == 1
        # 1/3 * 100. Compare with a tolerance rather than exact float
        # equality, so both a rounded 33.33 and an unrounded 33.333... pass.
        assert summary['success_rate'] == pytest.approx(33.33, abs=0.01)
class TestIntegrationML:
    """Integration tests for ML components working together.

    Both tests are placeholders that are skipped unconditionally. The skip
    is declared with ``pytest.mark.skip`` rather than by calling
    ``pytest.skip()`` inside the coroutine, so the reason is visible at the
    definition and the test body is never entered.
    """

    @pytest.mark.skip(reason="Requires actual Prophet dependencies for integration test")
    @pytest.mark.asyncio
    async def test_end_to_end_training_flow(self):
        """Test complete training flow from data to model.

        This test would require actual Prophet and data processing, so it
        is skipped for now.
        """

    @pytest.mark.skip(reason="Requires actual dependencies for integration test")
    @pytest.mark.asyncio
    async def test_data_pipeline_integration(self):
        """Test data processor -> prophet manager integration (skipped for now)."""