REFACTOR - Madrid Open data file
This commit is contained in:
1729
services/data/app/external/madrid_opendata.py
vendored
1729
services/data/app/external/madrid_opendata.py
vendored
File diff suppressed because it is too large
Load Diff
@@ -16,67 +16,3 @@ from app.core.database import Base, get_db
|
||||
from app.models.sales import SalesData
|
||||
from app.models.weather import WeatherData, WeatherForecast
|
||||
from app.models.traffic import TrafficData
|
||||
|
||||
# Test database URL: in-memory SQLite through the async aiosqlite driver.
TEST_DATABASE_URL = "sqlite+aiosqlite:///:memory:"

# Create test engine.
# StaticPool pins a single connection so the in-memory database survives
# across sessions; check_same_thread=False is required because the async
# driver may touch the connection from worker threads.
test_engine = create_async_engine(
    TEST_DATABASE_URL,
    connect_args={"check_same_thread": False},
    poolclass=StaticPool,
)

# Session factory bound to the test engine.  expire_on_commit=False keeps
# ORM objects usable after a commit without re-fetching them.
TestingSessionLocal = async_sessionmaker(
    test_engine,
    class_=AsyncSession,
    expire_on_commit=False,
)
|
||||
|
||||
@pytest_asyncio.fixture
async def db():
    """Create test database session.

    Creates every table before yielding a session and drops them all
    afterwards, so each test runs against a fresh schema.
    """
    async with test_engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    async with TestingSessionLocal() as session:
        yield session

    async with test_engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
|
||||
|
||||
@pytest.fixture
def client():
    """Create test client.

    Overrides the app's ``get_db`` dependency with a session bound to the
    in-memory test engine, yields a ``TestClient``, and clears the override
    on teardown so other tests are unaffected.
    """
    async def override_get_db():
        async with TestingSessionLocal() as session:
            yield session

    app.dependency_overrides[get_db] = override_get_db

    with TestClient(app) as test_client:
        yield test_client

    app.dependency_overrides.clear()
|
||||
|
||||
@pytest.fixture
def test_tenant_id():
    """Test tenant ID (a fresh random UUID per test)."""
    return uuid.uuid4()
|
||||
|
||||
@pytest.fixture
def test_sales_data():
    """Sample sales data for testing a single sales record."""
    return {
        "date": datetime.now(),
        "product_name": "Pan Integral",
        "quantity_sold": 25,
        "revenue": 37.50,
        "location_id": "madrid_centro",
        "source": "test",
    }
|
||||
|
||||
@pytest.fixture
def mock_auth_token():
    """Mock authentication token in ``Bearer`` header format."""
    return "Bearer test-token-123"
|
||||
16
services/data/tests/pytest.ini
Normal file
16
services/data/tests/pytest.ini
Normal file
@@ -0,0 +1,16 @@
|
||||
[tool:pytest]
# pytest.ini - Configuration for async testing
asyncio_mode = auto
addopts = -v --tb=short --capture=no
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
markers =
    asyncio: mark test as async
    slow: mark test as slow
    integration: mark test as integration test
filterwarnings =
    ignore::DeprecationWarning
    ignore::PendingDeprecationWarning
    ignore::PydanticDeprecatedSince20
|
||||
405
services/data/tests/test_madrid_opendata.py
Normal file
405
services/data/tests/test_madrid_opendata.py
Normal file
@@ -0,0 +1,405 @@
|
||||
#!/usr/bin/env python3
"""
Updated Madrid Historical Traffic test for pytest inside Docker.

Configured for June 2025 data availability (last available historical data).
"""

import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List

import pytest
import structlog

# Import from the actual service
from app.external.madrid_opendata import MadridOpenDataClient
from app.core.config import settings

# Configure pytest for async: every coroutine test in this module is
# collected as an asyncio test.
pytestmark = pytest.mark.asyncio

# Use the service's structured logger rather than the stdlib one.
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class TestMadridTrafficInside:
    """Test class for Madrid traffic functionality inside Docker."""

    @pytest.fixture
    def client(self):
        """Create Madrid client for testing."""
        return MadridOpenDataClient()

    @pytest.fixture
    def madrid_coords(self):
        """Madrid center coordinates (lat, lon)."""
        return 40.4168, -3.7038

    @pytest.fixture
    def june_2025_dates(self):
        """Date ranges for June 2025 (last available historical data)."""
        return {
            "quick": {
                "start": datetime(2025, 6, 1, 0, 0),
                "end": datetime(2025, 6, 1, 6, 0),  # 6 hours on June 1st
            },
            "one_day": {
                "start": datetime(2025, 6, 15, 0, 0),  # Mid-June
                "end": datetime(2025, 6, 16, 0, 0),  # One full day
            },
            "three_days": {
                "start": datetime(2025, 6, 10, 0, 0),
                "end": datetime(2025, 6, 13, 0, 0),  # 3 days in June
            },
            "recent_synthetic": {
                "start": datetime.now() - timedelta(hours=6),
                "end": datetime.now(),  # Recent data (will be synthetic)
            },
        }

    async def test_quick_historical_traffic_june2025(self, client, madrid_coords, june_2025_dates):
        """Test quick historical traffic data from June 2025."""
        lat, lon = madrid_coords
        date_range = june_2025_dates["quick"]
        start_time = date_range["start"]
        end_time = date_range["end"]

        print(f"\n=== Quick Test (June 2025 - 6 hours) ===")
        print(f"Location: {lat}, {lon}")
        print(f"Date range: {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')}")
        print(f"Note: Testing with June 2025 data (last available historical month)")

        # Time the call so we can assert it finishes in a reasonable window.
        execution_start = datetime.now()
        result = await client.get_historical_traffic(lat, lon, start_time, end_time)
        execution_time = (datetime.now() - execution_start).total_seconds()

        print(f"⏱️ Execution time: {execution_time:.2f} seconds")
        print(f"📊 Records returned: {len(result)}")

        # Basic shape assertions.
        assert isinstance(result, list), "Result should be a list"
        assert len(result) > 0, "Should return at least some records"
        assert execution_time < 30, "Should execute in reasonable time (allowing for ZIP download)"

        # Check first record structure.
        if result:
            sample = result[0]
            print(f"📋 Sample record keys: {list(sample.keys())}")
            print(f"📡 Data source: {sample.get('source', 'unknown')}")

            # Required fields must all be present.
            required_fields = ['date', 'traffic_volume', 'congestion_level', 'average_speed', 'source']
            for field in required_fields:
                assert field in sample, f"Missing required field: {field}"

            # Data validation: sanity-check types and plausible ranges.
            assert isinstance(sample['traffic_volume'], int), "Traffic volume should be int"
            assert 0 <= sample['traffic_volume'] <= 1000, "Traffic volume should be reasonable"
            assert sample['congestion_level'] in ['low', 'medium', 'high', 'blocked'], "Invalid congestion level"
            assert 5 <= sample['average_speed'] <= 100, "Speed should be reasonable"
            assert isinstance(sample['date'], datetime), "Date should be datetime object"

            # Report whether we got real Madrid data or a synthetic fallback.
            if sample['source'] == 'madrid_opendata_zip':
                print(f"🎉 SUCCESS: Got real Madrid historical data from ZIP!")
            else:
                print(f"ℹ️ Got synthetic data (real data may not be available)")

        print(f"✅ All validations passed")

    async def test_one_day_june2025(self, client, madrid_coords, june_2025_dates):
        """Test one day of June 2025 historical traffic data."""
        lat, lon = madrid_coords
        date_range = june_2025_dates["one_day"]
        start_time = date_range["start"]
        end_time = date_range["end"]

        print(f"\n=== One Day Test (June 15, 2025) ===")
        print(f"Date range: {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records returned: {len(result)}")

        # Expect roughly 24 records (one per hour), with slack either way.
        assert len(result) >= 20, "Should have at least 20 hourly records for one day"
        assert len(result) <= 30, "Should not have more than 30 records for one day"

        # Check data source breakdown.
        if result:
            sources = set(r['source'] for r in result)
            print(f"📡 Data sources: {', '.join(sources)}")

            # If we got real data, check for realistic measurement point IDs.
            real_data_records = [r for r in result if r['source'] == 'madrid_opendata_zip']
            if real_data_records:
                point_ids = set(r['measurement_point_id'] for r in real_data_records)
                print(f"🏷️ Real measurement points found: {len(point_ids)}")
                print(f" Sample IDs: {list(point_ids)[:3]}")

        # Check traffic patterns over the day, when we have full coverage.
        if len(result) >= 24:
            # Find rush hour records (7-9 AM, 6-8 PM) vs night records.
            rush_hour_records = [r for r in result if 7 <= r['date'].hour <= 9 or 18 <= r['date'].hour <= 20]
            night_records = [r for r in result if r['date'].hour <= 6 or r['date'].hour >= 22]

            if rush_hour_records and night_records:
                avg_rush_traffic = sum(r['traffic_volume'] for r in rush_hour_records) / len(rush_hour_records)
                avg_night_traffic = sum(r['traffic_volume'] for r in night_records) / len(night_records)

                print(f"📈 Rush hour avg traffic: {avg_rush_traffic:.1f}")
                print(f"🌙 Night avg traffic: {avg_night_traffic:.1f}")

                # Rush hour should typically have more traffic than night.
                if avg_rush_traffic > avg_night_traffic:
                    print(f"✅ Traffic patterns look realistic")
                else:
                    print(f"⚠️ Traffic patterns unusual (not necessarily wrong)")

    async def test_three_days_june2025(self, client, madrid_coords, june_2025_dates):
        """Test three days of June 2025 historical traffic data."""
        lat, lon = madrid_coords
        date_range = june_2025_dates["three_days"]
        start_time = date_range["start"]
        end_time = date_range["end"]

        print(f"\n=== Three Days Test (June 10-13, 2025) ===")
        print(f"Date range: {start_time.strftime('%Y-%m-%d')} to {end_time.strftime('%Y-%m-%d')}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records returned: {len(result)}")

        # Expect roughly 72 records (24 hours * 3 days), with slack.
        assert len(result) >= 60, "Should have at least 60 records for 3 days"
        assert len(result) <= 90, "Should not have more than 90 records for 3 days"

        # Report data sources.
        sources = set(r['source'] for r in result)
        print(f"📡 Data sources: {', '.join(sources)}")

        # Aggregate statistics over the whole range.
        traffic_volumes = [r['traffic_volume'] for r in result]
        speeds = [r['average_speed'] for r in result]

        avg_traffic = sum(traffic_volumes) / len(traffic_volumes)
        max_traffic = max(traffic_volumes)
        min_traffic = min(traffic_volumes)
        avg_speed = sum(speeds) / len(speeds)

        print(f"📈 Statistics:")
        print(f" Average traffic: {avg_traffic:.1f}")
        print(f" Max traffic: {max_traffic}")
        print(f" Min traffic: {min_traffic}")
        print(f" Average speed: {avg_speed:.1f} km/h")

        # Analyze by data source.
        real_data_records = [r for r in result if r['source'] == 'madrid_opendata_zip']
        synthetic_records = [r for r in result if r['source'] != 'madrid_opendata_zip']

        print(f"🔍 Data breakdown:")
        print(f" Real Madrid data: {len(real_data_records)} records")
        print(f" Synthetic data: {len(synthetic_records)} records")

        if real_data_records:
            # Show measurement points from real data.
            real_points = set(r['measurement_point_id'] for r in real_data_records)
            print(f" Real measurement points: {len(real_points)}")

        # Sanity checks on the aggregates.
        assert 10 <= avg_traffic <= 500, "Average traffic should be reasonable"
        assert 10 <= avg_speed <= 60, "Average speed should be reasonable"
        assert max_traffic >= avg_traffic, "Max should be >= average"
        assert min_traffic <= avg_traffic, "Min should be <= average"

    async def test_recent_vs_historical_data(self, client, madrid_coords, june_2025_dates):
        """Compare recent data (synthetic) vs June 2025 data (potentially real)."""
        lat, lon = madrid_coords

        print(f"\n=== Recent vs Historical Data Comparison ===")

        # Test recent data (should be synthetic).
        recent_range = june_2025_dates["recent_synthetic"]
        recent_result = await client.get_historical_traffic(
            lat, lon, recent_range["start"], recent_range["end"]
        )

        # Test June 2025 data (potentially real).
        june_range = june_2025_dates["quick"]
        june_result = await client.get_historical_traffic(
            lat, lon, june_range["start"], june_range["end"]
        )

        print(f"📊 Recent data: {len(recent_result)} records")
        print(f"📊 June 2025 data: {len(june_result)} records")

        if recent_result:
            recent_sources = set(r['source'] for r in recent_result)
            print(f"📡 Recent sources: {', '.join(recent_sources)}")

        if june_result:
            june_sources = set(r['source'] for r in june_result)
            print(f"📡 June sources: {', '.join(june_sources)}")

            # Check if we successfully got real data from June.
            if 'madrid_opendata_zip' in june_sources:
                print(f"🎉 SUCCESS: Real Madrid data successfully fetched from June 2025!")

                # Show details of real data.
                real_records = [r for r in june_result if r['source'] == 'madrid_opendata_zip']
                if real_records:
                    sample = real_records[0]
                    print(f"📋 Real data sample:")
                    print(f" Date: {sample['date']}")
                    print(f" Traffic volume: {sample['traffic_volume']}")
                    print(f" Measurement point: {sample['measurement_point_id']}")
                    print(f" Point name: {sample.get('measurement_point_name', 'N/A')}")
            else:
                print(f"ℹ️ June data is synthetic (real ZIP may not be accessible)")

    async def test_madrid_zip_month_code(self, client):
        """Test the month code calculation for Madrid ZIP files."""
        print(f"\n=== Madrid ZIP Month Code Test ===")

        # Known and predicted (year, month) -> Madrid catalogue month code.
        test_cases = [
            (2025, 6, 145),  # Known: June 2025 = 145
            (2025, 5, 144),  # Known: May 2025 = 144
            (2025, 4, 143),  # Known: April 2025 = 143
            (2025, 7, 146),  # Predicted: July 2025 = 146
        ]

        for year, month, expected_code in test_cases:
            if hasattr(client, '_calculate_madrid_month_code'):
                calculated_code = client._calculate_madrid_month_code(year, month)
                status = "✅" if calculated_code == expected_code else "⚠️"
                print(f"{status} {year}-{month:02d}: Expected {expected_code}, Got {calculated_code}")

                # Generate ZIP URL from the calculated code.
                if calculated_code:
                    zip_url = f"https://datos.madrid.es/egob/catalogo/208627-{calculated_code}-transporte-ptomedida-historico.zip"
                    print(f" ZIP URL: {zip_url}")
            else:
                print(f"⚠️ Month code calculation function not available")

    async def test_edge_case_large_date_range(self, client, madrid_coords):
        """Test edge case: date range too large."""
        lat, lon = madrid_coords
        start_time = datetime(2025, 1, 1)  # 6+ months range
        end_time = datetime(2025, 7, 1)

        print(f"\n=== Edge Case: Large Date Range ===")
        print(f"Testing 6-month range: {start_time.date()} to {end_time.date()}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records for 6-month range: {len(result)}")

        # Should return empty list for ranges > 90 days.
        assert len(result) == 0, "Should return empty list for date ranges > 90 days"
        print(f"✅ Correctly handled large date range")

    async def test_edge_case_invalid_coordinates(self, client):
        """Test edge case: invalid coordinates."""
        print(f"\n=== Edge Case: Invalid Coordinates ===")

        start_time = datetime(2025, 6, 1)
        end_time = datetime(2025, 6, 1, 6, 0)

        # Test with clearly-out-of-range coordinates.
        result = await client.get_historical_traffic(999.0, 999.0, start_time, end_time)

        print(f"📊 Records for invalid coords: {len(result)}")

        # Should either return empty list or synthetic data;
        # the function should not crash.
        assert isinstance(result, list), "Should return list even with invalid coords"
        print(f"✅ Handled invalid coordinates gracefully")

    async def test_real_madrid_zip_access(self, client):
        """Test if we can access the actual Madrid ZIP files."""
        print(f"\n=== Real Madrid ZIP Access Test ===")

        # Known catalogue ZIP URLs for the last three available months.
        test_urls = [
            "https://datos.madrid.es/egob/catalogo/208627-145-transporte-ptomedida-historico.zip",  # June 2025
            "https://datos.madrid.es/egob/catalogo/208627-144-transporte-ptomedida-historico.zip",  # May 2025
            "https://datos.madrid.es/egob/catalogo/208627-143-transporte-ptomedida-historico.zip",  # April 2025
        ]

        for i, url in enumerate(test_urls):
            month_name = ["June 2025", "May 2025", "April 2025"][i]
            print(f"\nTesting {month_name}: {url}")

            try:
                if hasattr(client, '_fetch_historical_zip'):
                    zip_data = await client._fetch_historical_zip(url)
                    if zip_data:
                        print(f"✅ Successfully fetched ZIP: {len(zip_data)} bytes")

                        # Try to inspect ZIP contents without extracting.
                        try:
                            import zipfile
                            from io import BytesIO

                            with zipfile.ZipFile(BytesIO(zip_data), 'r') as zip_file:
                                files = zip_file.namelist()
                                csv_files = [f for f in files if f.endswith('.csv')]
                                print(f"📁 ZIP contains {len(files)} files, {len(csv_files)} CSV files")

                                if csv_files:
                                    print(f" CSV files: {csv_files[:2]}{'...' if len(csv_files) > 2 else ''}")

                        except Exception as e:
                            print(f"⚠️ Could not inspect ZIP contents: {e}")
                    else:
                        print(f"❌ Failed to fetch ZIP")
                else:
                    print(f"⚠️ ZIP fetch function not available")

            except Exception as e:
                print(f"❌ Error testing ZIP access: {e}")
|
||||
|
||||
|
||||
# Additional standalone test functions for manual running
|
||||
async def run_manual_test():
    """Manual test function that can be run directly (outside pytest)."""
    print("=" * 60)
    print("MADRID TRAFFIC TEST - JUNE 2025 DATA")
    print("=" * 60)

    client = MadridOpenDataClient()
    madrid_lat, madrid_lon = 40.4168, -3.7038

    # Test with June 2025 data (last available): June 15, 2 PM - 6 PM.
    start_time = datetime(2025, 6, 15, 14, 0)
    end_time = datetime(2025, 6, 15, 18, 0)

    print(f"\nTesting June 15, 2025 data (2 PM - 6 PM)...")
    print(f"This should include afternoon traffic patterns")

    result = await client.get_historical_traffic(madrid_lat, madrid_lon, start_time, end_time)

    print(f"Result: {len(result)} records")

    if result:
        sources = set(r['source'] for r in result)
        print(f"Data sources: {', '.join(sources)}")

        if 'madrid_opendata_zip' in sources:
            print(f"🎉 Successfully got real Madrid data!")

        # Dump the first record, formatting the timestamp explicitly.
        sample = result[0]
        print(f"\nSample record:")
        for key, value in sample.items():
            if key == "date":
                print(f" {key}: {value.strftime('%Y-%m-%d %H:%M:%S')}")
            else:
                print(f" {key}: {value}")

    print(f"\n✅ Manual test completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# If run directly, execute manual test
|
||||
asyncio.run(run_manual_test())
|
||||
Reference in New Issue
Block a user