# services/training/tests/test_ml.py
"""
Tests for ML components: trainer, prophet_manager, and data_processor
"""

import pytest
import pandas as pd
import numpy as np
from unittest.mock import Mock, patch, AsyncMock
from datetime import datetime, timedelta
import os
import tempfile

from app.ml.trainer import BakeryMLTrainer
from app.ml.prophet_manager import BakeryProphetManager
from app.ml.data_processor import BakeryDataProcessor

# Fixed seed so the randomly generated sample data is identical on every run.
_RANDOM_SEED = 42


class TestBakeryDataProcessor:
    """Test the data processor component"""

    @pytest.fixture
    def data_processor(self):
        """Provide a fresh BakeryDataProcessor for each test."""
        return BakeryDataProcessor()

    @pytest.fixture
    def sample_sales_data(self):
        """Create 60 days of sample sales data for a single product."""
        periods = 60
        rng = np.random.default_rng(_RANDOM_SEED)
        dates = pd.date_range('2024-01-01', periods=periods, freq='D')
        return pd.DataFrame({
            'date': dates,
            'product_name': ['Pan Integral'] * periods,
            # Integer noise in [-10, 10] around a base quantity of 45.
            'quantity': 45 + rng.integers(-10, 11, size=periods),
        })

    @pytest.fixture
    def sample_weather_data(self):
        """Create 60 days of sample weather data."""
        periods = 60
        rng = np.random.default_rng(_RANDOM_SEED)
        days = np.arange(periods)
        dates = pd.date_range('2024-01-01', periods=periods, freq='D')
        return pd.DataFrame({
            'date': dates,
            'temperature': (
                15 + 5 * np.sin(2 * np.pi * days / 365)
                + rng.normal(0, 2, size=periods)
            ),
            # Exponential samples are non-negative, so no clipping is needed.
            'precipitation': rng.exponential(1, size=periods),
            'humidity': 60 + rng.normal(0, 10, size=periods),
        })

    @pytest.fixture
    def sample_traffic_data(self):
        """Create 60 days of sample traffic data."""
        periods = 60
        rng = np.random.default_rng(_RANDOM_SEED)
        dates = pd.date_range('2024-01-01', periods=periods, freq='D')
        return pd.DataFrame({
            'date': dates,
            'traffic_volume': 100 + rng.normal(0, 20, size=periods),
        })

    @pytest.mark.asyncio
    async def test_prepare_training_data_basic(
        self,
        data_processor,
        sample_sales_data,
        sample_weather_data,
        sample_traffic_data
    ):
        """Test basic data preparation"""
        result = await data_processor.prepare_training_data(
            sales_data=sample_sales_data,
            weather_data=sample_weather_data,
            traffic_data=sample_traffic_data,
            product_name="Pan Integral"
        )

        # Check result structure
        assert isinstance(result, pd.DataFrame)
        assert 'ds' in result.columns
        assert 'y' in result.columns
        assert len(result) > 0

        # Check Prophet format. The datetime check is resolution-agnostic
        # (ns/us/ms/s) but still rejects timezone-aware columns.
        assert pd.api.types.is_datetime64_dtype(result['ds'])
        assert pd.api.types.is_numeric_dtype(result['y'])

        # Check temporal features
        temporal_features = ['day_of_week', 'is_weekend', 'month', 'is_holiday']
        for feature in temporal_features:
            assert feature in result.columns

        # Check weather features
        weather_features = ['temperature', 'precipitation', 'humidity']
        for feature in weather_features:
            assert feature in result.columns

        # Check traffic features
        assert 'traffic_volume' in result.columns

    @pytest.mark.asyncio
    async def test_prepare_training_data_empty_weather(
        self,
        data_processor,
        sample_sales_data
    ):
        """Test data preparation with empty weather data"""
        result = await data_processor.prepare_training_data(
            sales_data=sample_sales_data,
            weather_data=pd.DataFrame(),
            traffic_data=pd.DataFrame(),
            product_name="Pan Integral"
        )

        # Should still work with default values
        assert isinstance(result, pd.DataFrame)
        assert 'ds' in result.columns
        assert 'y' in result.columns

        # Should have default weather values
        assert 'temperature' in result.columns
        assert result['temperature'].iloc[0] == 15.0  # Default value

    @pytest.mark.asyncio
    async def test_prepare_prediction_features(self, data_processor):
        """Test preparation of prediction features"""
        future_dates = pd.date_range('2024-02-01', periods=7, freq='D')
        weather_forecast = pd.DataFrame({
            'ds': future_dates,
            'temperature': [18.0] * 7,
            'precipitation': [0.0] * 7,
            'humidity': [65.0] * 7
        })

        result = await data_processor.prepare_prediction_features(
            future_dates=future_dates,
            weather_forecast=weather_forecast,
            traffic_forecast=pd.DataFrame()
        )

        assert isinstance(result, pd.DataFrame)
        assert len(result) == 7
        assert 'ds' in result.columns

        # Check temporal features are added
        assert 'day_of_week' in result.columns
        assert 'is_weekend' in result.columns

        # Check weather features
        assert 'temperature' in result.columns
        assert (result['temperature'] == 18.0).all()

    def test_add_temporal_features(self, data_processor):
        """Test temporal feature engineering"""
        dates = pd.date_range('2024-01-01', periods=10, freq='D')
        df = pd.DataFrame({'date': dates})

        result = data_processor._add_temporal_features(df)

        # Check temporal features
        expected_columns = [
            'day_of_week',
            'is_weekend',
            'month',
            'season',
            'week_of_year',
            'quarter',
            'is_holiday',
            'is_school_holiday',
        ]
        for column in expected_columns:
            assert column in result.columns

        # Check weekend detection
        # 2024-01-01 was a Monday (day_of_week = 0)
        assert result.iloc[0]['day_of_week'] == 0
        assert result.iloc[0]['is_weekend'] == 0

        # 2024-01-06 was a Saturday (day_of_week = 5)
        assert result.iloc[5]['day_of_week'] == 5
        assert result.iloc[5]['is_weekend'] == 1

    def test_spanish_holiday_detection(self, data_processor):
        """Test Spanish holiday detection"""
        # Test known Spanish holidays
        new_year = datetime(2024, 1, 1)
        epiphany = datetime(2024, 1, 6)
        labour_day = datetime(2024, 5, 1)
        christmas = datetime(2024, 12, 25)

        assert data_processor._is_spanish_holiday(new_year)
        assert data_processor._is_spanish_holiday(epiphany)
        assert data_processor._is_spanish_holiday(labour_day)
        assert data_processor._is_spanish_holiday(christmas)

        # Test non-holiday
        regular_day = datetime(2024, 3, 15)
        assert not data_processor._is_spanish_holiday(regular_day)

    @pytest.mark.asyncio
    async def test_prepare_training_data_insufficient_data(self, data_processor):
        """Test handling of insufficient training data"""
        # Create very small dataset
        small_sales_data = pd.DataFrame({
            'date': pd.date_range('2024-01-01', periods=5, freq='D'),
            'product_name': ['Pan Integral'] * 5,
            'quantity': [45, 50, 48, 52, 49]
        })

        # NOTE(review): the concrete exception type raised by the processor is
        # not visible from this file; narrow `Exception` once it is confirmed.
        with pytest.raises(Exception):
            await data_processor.prepare_training_data(
                sales_data=small_sales_data,
                weather_data=pd.DataFrame(),
                traffic_data=pd.DataFrame(),
                product_name="Pan Integral"
            )


class TestBakeryProphetManager:
    """Test the Prophet manager component"""

    @pytest.fixture
    def prophet_manager(self, tmp_path):
        """Provide a BakeryProphetManager whose model storage path is patched.

        The manager is yielded from inside the patch so the patched
        ``MODEL_STORAGE_PATH`` stays in effect for the whole test, not only
        while the manager is constructed. pytest's per-test ``tmp_path`` is
        used as the storage directory, so tests do not share state.
        """
        with patch(
            'app.ml.prophet_manager.settings.MODEL_STORAGE_PATH',
            str(tmp_path)
        ):
            yield BakeryProphetManager()

    @pytest.fixture
    def sample_prophet_data(self):
        """Create 100 days of sample data in Prophet format (ds, y, regressors)."""
        periods = 100
        rng = np.random.default_rng(_RANDOM_SEED)
        days = np.arange(periods)
        dates = pd.date_range('2024-01-01', periods=periods, freq='D')
        return pd.DataFrame({
            'ds': dates,
            # Weekly sinusoid plus Gaussian noise.
            'y': (
                45 + 10 * np.sin(2 * np.pi * days / 7)
                + rng.normal(0, 5, size=periods)
            ),
            'temperature': 15 + 5 * np.sin(2 * np.pi * days / 365),
            'humidity': 60 + rng.normal(0, 10, size=periods),
        })

    @pytest.mark.asyncio
    async def test_train_bakery_model_success(self, prophet_manager, sample_prophet_data):
        """Test successful model training"""
        # NOTE(review): this patches `prophet.Prophet`; if the manager module
        # binds the class via `from prophet import Prophet`, the patch target
        # may need to be the manager module's own name - confirm.
        with patch('prophet.Prophet') as mock_prophet_class:
            mock_model = Mock()
            mock_model.fit.return_value = None
            mock_prophet_class.return_value = mock_model

            with patch('joblib.dump') as mock_dump:
                result = await prophet_manager.train_bakery_model(
                    tenant_id="test-tenant",
                    product_name="Pan Integral",
                    df=sample_prophet_data,
                    job_id="test-job-123"
                )

        # Check result structure
        assert isinstance(result, dict)
        assert 'model_id' in result
        assert 'model_path' in result
        assert 'type' in result
        assert result['type'] == 'prophet'
        assert 'training_samples' in result
        assert 'features' in result
        assert 'training_metrics' in result

        # Check that model was fitted and persisted
        mock_model.fit.assert_called_once()
        mock_dump.assert_called_once()

    @pytest.mark.asyncio
    async def test_validate_training_data_valid(self, prophet_manager, sample_prophet_data):
        """Test validation with valid data"""
        # Should not raise exception
        await prophet_manager._validate_training_data(sample_prophet_data, "Pan Integral")

    @pytest.mark.asyncio
    async def test_validate_training_data_insufficient(self, prophet_manager):
        """Test validation with insufficient data"""
        small_data = pd.DataFrame({
            'ds': pd.date_range('2024-01-01', periods=5, freq='D'),
            'y': [45, 50, 48, 52, 49]
        })

        with pytest.raises(ValueError, match="Insufficient training data"):
            await prophet_manager._validate_training_data(small_data, "Pan Integral")

    @pytest.mark.asyncio
    async def test_validate_training_data_missing_columns(self, prophet_manager):
        """Test validation with missing required columns"""
        invalid_data = pd.DataFrame({
            'date': pd.date_range('2024-01-01', periods=50, freq='D'),
            'quantity': [45] * 50
        })

        with pytest.raises(ValueError, match="Missing required columns"):
            await prophet_manager._validate_training_data(invalid_data, "Pan Integral")

    def test_get_spanish_holidays(self, prophet_manager):
        """Test Spanish holidays creation"""
        holidays = prophet_manager._get_spanish_holidays()

        # An empty frame is tolerated; the checks below only run when
        # holidays were actually produced.
        if not holidays.empty:
            assert 'holiday' in holidays.columns
            assert 'ds' in holidays.columns

            # Check some known holidays exist
            holiday_names = holidays['holiday'].unique()
            expected_holidays = ['new_year', 'christmas', 'may_day']

            for holiday in expected_holidays:
                assert holiday in holiday_names

    def test_extract_regressor_columns(self, prophet_manager, sample_prophet_data):
        """Test regressor column extraction"""
        regressors = prophet_manager._extract_regressor_columns(sample_prophet_data)

        assert isinstance(regressors, list)
        assert 'temperature' in regressors
        assert 'humidity' in regressors
        assert 'ds' not in regressors  # Should be excluded
        assert 'y' not in regressors  # Should be excluded

    @pytest.mark.asyncio
    async def test_generate_forecast(self, prophet_manager):
        """Test forecast generation"""
        # Create a temporary model file; its content is never read because
        # joblib.load is patched below.
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as temp_file:
            model_path = temp_file.name

        try:
            # Mock a saved model
            with patch('joblib.load') as mock_load:
                mock_model = Mock()
                mock_forecast = pd.DataFrame({
                    'ds': pd.date_range('2024-02-01', periods=7, freq='D'),
                    'yhat': [50.0] * 7,
                    'yhat_lower': [45.0] * 7,
                    'yhat_upper': [55.0] * 7
                })
                mock_model.predict.return_value = mock_forecast
                mock_load.return_value = mock_model

                future_data = pd.DataFrame({
                    'ds': pd.date_range('2024-02-01', periods=7, freq='D'),
                    'temperature': [18.0] * 7,
                    'humidity': [65.0] * 7
                })

                result = await prophet_manager.generate_forecast(
                    model_path=model_path,
                    future_dates=future_data,
                    regressor_columns=['temperature', 'humidity']
                )

                assert isinstance(result, pd.DataFrame)
                assert len(result) == 7
                mock_model.predict.assert_called_once()

        finally:
            # Cleanup; the file may already have been removed.
            try:
                os.unlink(model_path)
            except FileNotFoundError:
                pass


class TestBakeryMLTrainer:
    """Test the ML trainer component"""

    @pytest.fixture
    def ml_trainer(self, mock_prophet_manager, mock_data_processor):
        """Provide a BakeryMLTrainer.

        NOTE(review): `mock_prophet_manager` and `mock_data_processor` are not
        defined in this module; presumably they come from a conftest.py -
        confirm.
        """
        return BakeryMLTrainer()

    @pytest.fixture
    def sample_sales_data(self):
        """Sample sales data for training"""
        return [
            {"date": "2024-01-01", "product_name": "Pan Integral", "quantity": 45},
            {"date": "2024-01-02", "product_name": "Pan Integral", "quantity": 50},
            {"date": "2024-01-03", "product_name": "Pan Integral", "quantity": 48},
            {"date": "2024-01-04", "product_name": "Croissant", "quantity": 25},
            {"date": "2024-01-05", "product_name": "Croissant", "quantity": 30}
        ]

    @pytest.mark.asyncio
    async def test_train_tenant_models_success(
        self,
        ml_trainer,
        sample_sales_data,
        mock_prophet_manager,
        mock_data_processor
    ):
        """Test successful training of tenant models"""
        result = await ml_trainer.train_tenant_models(
            tenant_id="test-tenant",
            sales_data=sample_sales_data,
            weather_data=[],
            traffic_data=[],
            job_id="test-job-123"
        )

        # Check result structure
        assert isinstance(result, dict)
        assert 'job_id' in result
        assert 'tenant_id' in result
        assert 'status' in result
        assert 'training_results' in result
        assert 'summary' in result

        assert result['status'] == 'completed'
        assert result['tenant_id'] == 'test-tenant'

    @pytest.mark.asyncio
    async def test_train_single_product_success(
        self,
        ml_trainer,
        sample_sales_data,
        mock_prophet_manager,
        mock_data_processor
    ):
        """Test successful single product training"""
        product_sales = [
            item for item in sample_sales_data
            if item['product_name'] == 'Pan Integral'
        ]

        result = await ml_trainer.train_single_product(
            tenant_id="test-tenant",
            product_name="Pan Integral",
            sales_data=product_sales,
            weather_data=[],
            traffic_data=[],
            job_id="test-job-123"
        )

        # Check result structure
        assert isinstance(result, dict)
        assert 'job_id' in result
        assert 'tenant_id' in result
        assert 'product_name' in result
        assert 'status' in result
        assert 'model_info' in result

        assert result['status'] == 'success'
        assert result['product_name'] == 'Pan Integral'

    @pytest.mark.asyncio
    async def test_train_single_product_no_data(self, ml_trainer):
        """Test single product training with no data"""
        with pytest.raises(ValueError, match="No sales data found"):
            await ml_trainer.train_single_product(
                tenant_id="test-tenant",
                product_name="Nonexistent Product",
                sales_data=[],
                weather_data=[],
                traffic_data=[],
                job_id="test-job-123"
            )

    @pytest.mark.asyncio
    async def test_validate_input_data_valid(self, ml_trainer, sample_sales_data):
        """Test input data validation with valid data"""
        df = pd.DataFrame(sample_sales_data)

        # Should not raise exception
        await ml_trainer._validate_input_data(df, "test-tenant")

    @pytest.mark.asyncio
    async def test_validate_input_data_empty(self, ml_trainer):
        """Test input data validation with empty data"""
        empty_df = pd.DataFrame()

        with pytest.raises(ValueError, match="No sales data provided"):
            await ml_trainer._validate_input_data(empty_df, "test-tenant")

    @pytest.mark.asyncio
    async def test_validate_input_data_missing_columns(self, ml_trainer):
        """Test input data validation with missing columns"""
        invalid_df = pd.DataFrame([
            {"invalid_column": "value1"},
            {"invalid_column": "value2"}
        ])

        with pytest.raises(ValueError, match="Missing required columns"):
            await ml_trainer._validate_input_data(invalid_df, "test-tenant")

    def test_calculate_training_summary(self, ml_trainer):
        """Test training summary calculation"""
        training_results = {
            "Pan Integral": {
                "status": "success",
                "model_info": {"training_metrics": {"mae": 5.0, "rmse": 7.0}}
            },
            "Croissant": {
                "status": "error",
                "error_message": "Insufficient data"
            },
            "Baguette": {
                "status": "skipped",
                "reason": "insufficient_data"
            }
        }

        summary = ml_trainer._calculate_training_summary(training_results)

        assert summary['total_products'] == 3
        assert summary['successful_products'] == 1
        assert summary['failed_products'] == 1
        assert summary['skipped_products'] == 1
        # 1/3 * 100; tolerance-based so the check does not depend on the
        # exact float representation or on rounding to two decimals.
        assert summary['success_rate'] == pytest.approx(100 / 3, abs=0.01)


class TestIntegrationML:
    """Integration tests for ML components working together"""

    @pytest.mark.asyncio
    async def test_end_to_end_training_flow(self):
        """Test complete training flow from data to model"""
        # This test would require actual Prophet and data processing
        # Skip for now due to dependencies
        pytest.skip("Requires actual Prophet dependencies for integration test")

    @pytest.mark.asyncio
    async def test_data_pipeline_integration(self):
        """Test data processor -> prophet manager integration"""
        pytest.skip("Requires actual dependencies for integration test")