# bakery-ia/services/data/tests/test_madrid_opendata.py

#!/usr/bin/env python3
"""
Updated Madrid Historical Traffic test for pytest inside Docker
Configured for June 2025 data availability (last available historical data)
"""

import pytest
import asyncio
from datetime import datetime, timedelta
from typing import List, Dict, Any

# Import from the actual service
from app.external.madrid_opendata import MadridOpenDataClient
from app.core.config import settings
import structlog

# Apply the asyncio marker to every test collected from this module so the
# `async def` test methods below are executed as coroutines.
# NOTE(review): the marker is presumably provided by the pytest-asyncio
# plugin — confirm it is installed in the Docker test image.
pytestmark = pytest.mark.asyncio

# Module-level structlog logger. The tests below report progress through
# print(), so this logger is currently not used by them.
logger = structlog.get_logger()


class TestMadridTrafficInside:
    """Test class for Madrid traffic functionality inside Docker"""
    
    @pytest.fixture
    def client(self):
        """Build the Madrid Open Data client that the tests exercise."""
        madrid_client = MadridOpenDataClient()
        return madrid_client
    
    @pytest.fixture
    def madrid_coords(self):
        """Return the (latitude, longitude) tuple used as Madrid's center."""
        latitude = 40.4168
        longitude = -3.7038
        return latitude, longitude
    
    @pytest.fixture
    def june_2025_dates(self):
        """Named date windows used by the traffic tests.

        Returns:
            dict: maps a scenario name ("quick", "one_day", "three_days",
            "recent_synthetic") to a dict with naive ``datetime`` values
            under the keys "start" and "end".
        """
        # Read the clock once: calling datetime.now() separately for each
        # bound made the "recent" window slightly longer than six hours and
        # derived its two ends from different instants.
        now = datetime.now()

        return {
            # 6 hours on June 1st
            "quick": {
                "start": datetime(2025, 6, 1, 0, 0),
                "end": datetime(2025, 6, 1, 6, 0),
            },
            # One full day in mid-June
            "one_day": {
                "start": datetime(2025, 6, 15, 0, 0),
                "end": datetime(2025, 6, 16, 0, 0),
            },
            # 3 days in June
            "three_days": {
                "start": datetime(2025, 6, 10, 0, 0),
                "end": datetime(2025, 6, 13, 0, 0),
            },
            # Last six hours up to "now" (expected to be served as synthetic data)
            "recent_synthetic": {
                "start": now - timedelta(hours=6),
                "end": now,
            },
        }
    
    async def test_quick_historical_traffic_june2025(self, client, madrid_coords, june_2025_dates):
        """Fetch the six-hour "quick" June 2025 window and validate the first record.

        Asserts that the client returns a non-empty list in under 30 seconds
        and that the first record has the required keys with plausible values.
        """
        lat, lon = madrid_coords
        window = june_2025_dates["quick"]
        start_time, end_time = window["start"], window["end"]

        print("\n=== Quick Test (June 2025 - 6 hours) ===")
        print(f"Location: {lat}, {lon}")
        start_label = start_time.strftime('%Y-%m-%d %H:%M')
        end_label = end_time.strftime('%Y-%m-%d %H:%M')
        print(f"Date range: {start_label} to {end_label}")
        print("Note: Testing with June 2025 data (last available historical month)")

        # Time the call with wall-clock timestamps taken around the await.
        started_at = datetime.now()
        result = await client.get_historical_traffic(lat, lon, start_time, end_time)
        finished_at = datetime.now()
        execution_time = (finished_at - started_at).total_seconds()

        print(f"⏱️  Execution time: {execution_time:.2f} seconds")
        print(f"📊 Records returned: {len(result)}")

        # Top-level expectations on the response as a whole.
        assert isinstance(result, list), "Result should be a list"
        assert len(result) > 0, "Should return at least some records"
        assert execution_time < 30, "Should execute in reasonable time (allowing for ZIP download)"

        # Nothing to inspect when the result is empty.
        if not result:
            return

        sample = result[0]
        print(f"📋 Sample record keys: {list(sample.keys())}")
        print(f"📡 Data source: {sample.get('source', 'unknown')}")

        # Every record is expected to expose at least these keys.
        for field_name in ('date', 'traffic_volume', 'congestion_level', 'average_speed', 'source'):
            assert field_name in sample, f"Missing required field: {field_name}"

        # Type and range checks on the sample record.
        volume = sample['traffic_volume']
        assert isinstance(volume, int), "Traffic volume should be int"
        assert 0 <= volume <= 1000, "Traffic volume should be reasonable"
        assert sample['congestion_level'] in ('low', 'medium', 'high', 'blocked'), "Invalid congestion level"
        assert 5 <= sample['average_speed'] <= 100, "Speed should be reasonable"
        assert isinstance(sample['date'], datetime), "Date should be datetime object"

        # Report whether the record came from the historical ZIP or elsewhere.
        got_real_data = sample['source'] == 'madrid_opendata_zip'
        print(
            "🎉 SUCCESS: Got real Madrid historical data from ZIP!"
            if got_real_data
            else "ℹ️  Got synthetic data (real data may not be available)"
        )

        print("✅ All validations passed")
    
    async def test_one_day_june2025(self, client, madrid_coords, june_2025_dates):
        """Fetch the "one_day" window (June 15, 2025) and check the record count.

        Asserts between 20 and 30 records are returned; the source breakdown
        and the rush-hour versus night comparison are printed for information
        only and never fail the test.
        """
        lat, lon = madrid_coords
        window = june_2025_dates["one_day"]
        start_time, end_time = window["start"], window["end"]

        print("\n=== One Day Test (June 15, 2025) ===")
        start_label = start_time.strftime('%Y-%m-%d %H:%M')
        end_label = end_time.strftime('%Y-%m-%d %H:%M')
        print(f"Date range: {start_label} to {end_label}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)
        record_count = len(result)

        print(f"📊 Records returned: {record_count}")

        # Roughly one record per hour is expected (about 24).
        assert record_count >= 20, "Should have at least 20 hourly records for one day"
        assert record_count <= 30, "Should not have more than 30 records for one day"

        if result:
            sources = {record['source'] for record in result}
            print(f"📡 Data sources: {', '.join(sources)}")

            # Records served from the historical ZIP carry measurement point ids.
            zip_records = [record for record in result if record['source'] == 'madrid_opendata_zip']
            if zip_records:
                point_ids = {record['measurement_point_id'] for record in zip_records}
                print(f"🏷️  Real measurement points found: {len(point_ids)}")
                print(f"   Sample IDs: {list(point_ids)[:3]}")

        # The pattern comparison only runs when a full day of records is present.
        if record_count < 24:
            return

        # Split records into rush hours (7-9 and 18-20) and night (<=6 or >=22);
        # the two hour sets do not overlap, other hours are ignored.
        rush_hour_records = []
        night_records = []
        for record in result:
            hour = record['date'].hour
            if 7 <= hour <= 9 or 18 <= hour <= 20:
                rush_hour_records.append(record)
            elif hour <= 6 or hour >= 22:
                night_records.append(record)

        if not (rush_hour_records and night_records):
            return

        rush_total = sum(record['traffic_volume'] for record in rush_hour_records)
        avg_rush_traffic = rush_total / len(rush_hour_records)
        night_total = sum(record['traffic_volume'] for record in night_records)
        avg_night_traffic = night_total / len(night_records)

        print(f"📈 Rush hour avg traffic: {avg_rush_traffic:.1f}")
        print(f"🌙 Night avg traffic: {avg_night_traffic:.1f}")

        # Informational verdict only: an unusual pattern does not fail the test.
        print(
            "✅ Traffic patterns look realistic"
            if avg_rush_traffic > avg_night_traffic
            else "⚠️  Traffic patterns unusual (not necessarily wrong)"
        )
    
    async def test_three_days_june2025(self, client, madrid_coords, june_2025_dates):
        """Fetch the "three_days" window (June 10-13, 2025) and sanity-check aggregates.

        Asserts 60 to 90 records are returned and that the average traffic
        volume and average speed fall inside broad plausibility bounds.
        """

        def mean(values):
            # Arithmetic mean of a non-empty list.
            return sum(values) / len(values)

        lat, lon = madrid_coords
        window = june_2025_dates["three_days"]
        start_time, end_time = window["start"], window["end"]

        print("\n=== Three Days Test (June 10-13, 2025) ===")
        start_label = start_time.strftime('%Y-%m-%d')
        end_label = end_time.strftime('%Y-%m-%d')
        print(f"Date range: {start_label} to {end_label}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)
        record_count = len(result)

        print(f"📊 Records returned: {record_count}")

        # About 72 records are expected (24 hours * 3 days).
        assert record_count >= 60, "Should have at least 60 records for 3 days"
        assert record_count <= 90, "Should not have more than 90 records for 3 days"

        sources = {record['source'] for record in result}
        print(f"📡 Data sources: {', '.join(sources)}")

        # Aggregate statistics over the whole window.
        traffic_volumes = [record['traffic_volume'] for record in result]
        speeds = [record['average_speed'] for record in result]

        avg_traffic = mean(traffic_volumes)
        max_traffic = max(traffic_volumes)
        min_traffic = min(traffic_volumes)
        avg_speed = mean(speeds)

        print("📈 Statistics:")
        print(f"   Average traffic: {avg_traffic:.1f}")
        print(f"   Max traffic: {max_traffic}")
        print(f"   Min traffic: {min_traffic}")
        print(f"   Average speed: {avg_speed:.1f} km/h")

        # Split records by origin: historical ZIP versus everything else.
        real_data_records = []
        synthetic_records = []
        for record in result:
            if record['source'] == 'madrid_opendata_zip':
                real_data_records.append(record)
            else:
                synthetic_records.append(record)

        print("🔍 Data breakdown:")
        print(f"   Real Madrid data: {len(real_data_records)} records")
        print(f"   Synthetic data: {len(synthetic_records)} records")

        if real_data_records:
            real_points = {record['measurement_point_id'] for record in real_data_records}
            print(f"   Real measurement points: {len(real_points)}")

        # Broad plausibility bounds on the aggregates.
        assert 10 <= avg_traffic <= 500, "Average traffic should be reasonable"
        assert 10 <= avg_speed <= 60, "Average speed should be reasonable"
        assert max_traffic >= avg_traffic, "Max should be >= average"
        assert min_traffic <= avg_traffic, "Min should be <= average"
    
    async def test_recent_vs_historical_data(self, client, madrid_coords, june_2025_dates):
        """Compare the "recent_synthetic" window with the June 2025 "quick" window.

        Both windows are fetched through ``client.get_historical_traffic`` and
        their record counts and data sources are printed. The test asserts
        that each call returns a list, matching the contract the other tests
        in this class check; which source served the data is reported but
        never fails the test, since the real ZIP may not be reachable.

        Args:
            client: client fixture exposing ``get_historical_traffic``.
            madrid_coords: (latitude, longitude) fixture.
            june_2025_dates: fixture mapping scenario names to start/end dicts.
        """
        lat, lon = madrid_coords

        print(f"\n=== Recent vs Historical Data Comparison ===")

        # Recent window (expected to be synthetic)
        recent_range = june_2025_dates["recent_synthetic"]
        recent_result = await client.get_historical_traffic(
            lat, lon, recent_range["start"], recent_range["end"]
        )

        # June 2025 window (potentially real)
        june_range = june_2025_dates["quick"]
        june_result = await client.get_historical_traffic(
            lat, lon, june_range["start"], june_range["end"]
        )

        # Previously this test had no assertions and could only fail through
        # an incidental exception; pin the return type explicitly.
        assert isinstance(recent_result, list), "Recent result should be a list"
        assert isinstance(june_result, list), "June 2025 result should be a list"

        print(f"📊 Recent data: {len(recent_result)} records")
        print(f"📊 June 2025 data: {len(june_result)} records")

        if recent_result:
            recent_sources = set(r['source'] for r in recent_result)
            print(f"📡 Recent sources: {', '.join(recent_sources)}")

        if june_result:
            june_sources = set(r['source'] for r in june_result)
            print(f"📡 June sources: {', '.join(june_sources)}")

            # Report whether real data was obtained for June
            if 'madrid_opendata_zip' in june_sources:
                print(f"🎉 SUCCESS: Real Madrid data successfully fetched from June 2025!")

                # The source is present in june_sources, so at least one
                # matching record exists; show the first one.
                sample = next(r for r in june_result if r['source'] == 'madrid_opendata_zip')
                print(f"📋 Real data sample:")
                print(f"   Date: {sample['date']}")
                print(f"   Traffic volume: {sample['traffic_volume']}")
                print(f"   Measurement point: {sample['measurement_point_id']}")
                print(f"   Point name: {sample.get('measurement_point_name', 'N/A')}")
            else:
                print(f"ℹ️  June data is synthetic (real ZIP may not be accessible)")
    
    async def test_madrid_zip_month_code(self, client):
        """Check the month-code calculation used to build Madrid ZIP URLs.

        The three codes marked as known (April, May and June 2025) are
        asserted; they match the ZIP URLs hard-coded elsewhere in this test
        class. The July 2025 code is only a prediction, so a mismatch there
        is reported but does not fail the test.

        The test is skipped when the client does not expose
        ``_calculate_madrid_month_code``; previously that case printed the
        same warning once per test case and then passed.

        Args:
            client: client fixture under test.
        """
        print(f"\n=== Madrid ZIP Month Code Test ===")

        # Look the helper up once instead of on every loop iteration.
        calculate_code = getattr(client, '_calculate_madrid_month_code', None)
        if calculate_code is None:
            print(f"⚠️  Month code calculation function not available")
            pytest.skip("client does not provide _calculate_madrid_month_code")

        # (year, month, expected code, whether the expected code is confirmed)
        test_cases = [
            (2025, 6, 145, True),   # Known: June 2025 = 145
            (2025, 5, 144, True),   # Known: May 2025 = 144
            (2025, 4, 143, True),   # Known: April 2025 = 143
            (2025, 7, 146, False),  # Predicted: July 2025 = 146
        ]

        for year, month, expected_code, is_known in test_cases:
            calculated_code = calculate_code(year, month)
            status = "✅" if calculated_code == expected_code else "⚠️"
            print(f"{status} {year}-{month:02d}: Expected {expected_code}, Got {calculated_code}")

            # Show the ZIP URL the calculated code would produce
            if calculated_code:
                zip_url = f"https://datos.madrid.es/egob/catalogo/208627-{calculated_code}-transporte-ptomedida-historico.zip"
                print(f"   ZIP URL: {zip_url}")

            # Only confirmed codes are enforced; predictions stay informational.
            if is_known:
                assert calculated_code == expected_code, (
                    f"Month code for {year}-{month:02d} should be {expected_code}, "
                    f"got {calculated_code}"
                )
    
    async def test_edge_case_large_date_range(self, client, madrid_coords):
        """Request January 1 to July 1, 2025 and expect an empty result.

        The assertion encodes the expectation that ranges longer than 90 days
        yield no records.
        """
        lat, lon = madrid_coords
        range_start = datetime(2025, 1, 1)
        range_end = datetime(2025, 7, 1)

        print("\n=== Edge Case: Large Date Range ===")
        print(f"Testing 6-month range: {range_start.date()} to {range_end.date()}")

        result = await client.get_historical_traffic(lat, lon, range_start, range_end)
        record_count = len(result)

        print(f"📊 Records for 6-month range: {record_count}")

        # An over-long range is expected to be rejected with an empty list.
        assert record_count == 0, "Should return empty list for date ranges > 90 days"
        print("✅ Correctly handled large date range")
    
    async def test_edge_case_invalid_coordinates(self, client):
        """Call the client with out-of-range coordinates and expect a list back.

        Either an empty list or synthetic data is acceptable; the point of the
        test is that the call does not raise.
        """
        print("\n=== Edge Case: Invalid Coordinates ===")

        window_start = datetime(2025, 6, 1)
        window_end = datetime(2025, 6, 1, 6, 0)
        bad_lat = bad_lon = 999.0  # outside any valid latitude/longitude

        result = await client.get_historical_traffic(bad_lat, bad_lon, window_start, window_end)

        print(f"📊 Records for invalid coords: {len(result)}")

        # Only the return type is checked here.
        assert isinstance(result, list), "Should return list even with invalid coords"
        print("✅ Handled invalid coordinates gracefully")
    
    async def test_real_madrid_zip_access(self, client):
        """Try to download the known monthly ZIP archives and list their contents.

        This is a best-effort diagnostic: every outcome (helper missing,
        download failed, archive unreadable, unexpected error) is printed and
        none of them fails the test.
        """
        import zipfile
        from io import BytesIO

        print("\n=== Real Madrid ZIP Access Test ===")

        # Known archive URLs, each paired with the month it covers.
        labelled_urls = (
            ("June 2025", "https://datos.madrid.es/egob/catalogo/208627-145-transporte-ptomedida-historico.zip"),
            ("May 2025", "https://datos.madrid.es/egob/catalogo/208627-144-transporte-ptomedida-historico.zip"),
            ("April 2025", "https://datos.madrid.es/egob/catalogo/208627-143-transporte-ptomedida-historico.zip"),
        )

        for month_name, url in labelled_urls:
            print(f"\nTesting {month_name}: {url}")

            try:
                if not hasattr(client, '_fetch_historical_zip'):
                    print("⚠️  ZIP fetch function not available")
                    continue

                zip_data = await client._fetch_historical_zip(url)
                if not zip_data:
                    print("❌ Failed to fetch ZIP")
                    continue

                print(f"✅ Successfully fetched ZIP: {len(zip_data)} bytes")

                # Inspecting the archive is optional; report problems and move on.
                try:
                    with zipfile.ZipFile(BytesIO(zip_data), 'r') as archive:
                        files = archive.namelist()
                        csv_files = [name for name in files if name.endswith('.csv')]
                        print(f"📁 ZIP contains {len(files)} files, {len(csv_files)} CSV files")

                        if csv_files:
                            ellipsis = '...' if len(csv_files) > 2 else ''
                            print(f"   CSV files: {csv_files[:2]}{ellipsis}")

                except Exception as inspect_error:
                    print(f"⚠️  Could not inspect ZIP contents: {inspect_error}")

            except Exception as access_error:
                print(f"❌ Error testing ZIP access: {access_error}")


# Additional standalone test functions for manual running
async def run_manual_test():
    """Fetch four hours of June 15, 2025 traffic and print a summary.

    Intended for running this file directly, outside pytest. Prints the
    record count, the data sources seen and every field of the first record.
    """
    banner = "=" * 60
    print(banner)
    print("MADRID TRAFFIC TEST - JUNE 2025 DATA")
    print(banner)

    client = MadridOpenDataClient()
    madrid_lat = 40.4168
    madrid_lon = -3.7038

    # June 15, 2025 from 2 PM to 6 PM (4 hours) of the last available month.
    window_start = datetime(2025, 6, 15, 14, 0)
    window_end = datetime(2025, 6, 15, 18, 0)

    print("\nTesting June 15, 2025 data (2 PM - 6 PM)...")
    print("This should include afternoon traffic patterns")

    result = await client.get_historical_traffic(madrid_lat, madrid_lon, window_start, window_end)

    print(f"Result: {len(result)} records")

    if result:
        sources = {record['source'] for record in result}
        print(f"Data sources: {', '.join(sources)}")

        if 'madrid_opendata_zip' in sources:
            print("🎉 Successfully got real Madrid data!")

        print("\nSample record:")
        for key, value in result[0].items():
            # The "date" field is rendered with an explicit timestamp format.
            shown = value.strftime('%Y-%m-%d %H:%M:%S') if key == "date" else value
            print(f"  {key}: {shown}")

    print("\n✅ Manual test completed!")


if __name__ == "__main__":
    # Executed as a script: run the manual check on a new event loop
    # instead of going through pytest.
    manual_run = run_manual_test()
    asyncio.run(manual_run)