#!/usr/bin/env python3
"""
Updated Madrid Historical Traffic test for pytest inside Docker
Configured for June 2025 data availability (last available historical data)

The tests exercise ``MadridOpenDataClient.get_historical_traffic`` and, when
present, two private helpers of the client. Several tests are diagnostic only:
they print what they observe instead of asserting, because (judging by the
messages below) the client may answer with either real ZIP-backed records or
synthetic ones.
"""

import asyncio
import time
import zipfile
from datetime import datetime, timedelta
from io import BytesIO
from typing import Any, Dict, List

import pytest
import structlog

# Import from the actual service
from app.external.madrid_opendata import MadridOpenDataClient
from app.core.config import settings

# Configure pytest for async
pytestmark = pytest.mark.asyncio

# Use actual logger
logger = structlog.get_logger()

# Value of the ``source`` field that marks a record as real Madrid data.
REAL_DATA_SOURCE = "madrid_opendata_zip"

# Monthly historical archive URL; ``code`` is the Madrid month code
# (e.g. 145 for June 2025 according to the cases listed in this file).
ZIP_URL_TEMPLATE = (
    "https://datos.madrid.es/egob/catalogo/"
    "208627-{code}-transporte-ptomedida-historico.zip"
)


def _fmt(moment: datetime, pattern: str = "%Y-%m-%d %H:%M") -> str:
    """Format *moment* for the human-readable test output."""
    return moment.strftime(pattern)


def _data_sources(records: List[Dict[str, Any]]) -> List[str]:
    """Return the distinct ``source`` values of *records*, sorted for stable output."""
    return sorted({record["source"] for record in records})


def _real_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return only the records tagged as real Madrid ZIP data."""
    return [record for record in records if record["source"] == REAL_DATA_SOURCE]


def _mean(values: List[float]) -> float:
    """Return the arithmetic mean of a non-empty list."""
    return sum(values) / len(values)


def _describe_zip(zip_data: bytes) -> None:
    """Print a short summary of the files contained in an in-memory ZIP archive."""
    try:
        with zipfile.ZipFile(BytesIO(zip_data), "r") as zip_file:
            files = zip_file.namelist()
    except zipfile.BadZipFile as e:
        print(f"⚠️ Could not inspect ZIP contents: {e}")
        return

    csv_files = [name for name in files if name.endswith(".csv")]
    print(f"📁 ZIP contains {len(files)} files, {len(csv_files)} CSV files")
    if csv_files:
        print(f" CSV files: {csv_files[:2]}{'...' if len(csv_files) > 2 else ''}")


class TestMadridTrafficInside:
    """Test class for Madrid traffic functionality inside Docker"""

    @pytest.fixture
    def client(self):
        """Create Madrid client for testing"""
        return MadridOpenDataClient()

    @pytest.fixture
    def madrid_coords(self):
        """Madrid center coordinates as a ``(lat, lon)`` tuple"""
        return 40.4168, -3.7038

    @pytest.fixture
    def june_2025_dates(self):
        """Date ranges for June 2025 (last available historical data)

        Returns a dict keyed by scenario name; each value holds ``start`` and
        ``end`` datetimes.
        """
        # Read the clock once so the "recent" window is exactly six hours wide.
        now = datetime.now()
        return {
            "quick": {
                "start": datetime(2025, 6, 1, 0, 0),
                "end": datetime(2025, 6, 1, 6, 0),  # 6 hours on June 1st
            },
            "one_day": {
                "start": datetime(2025, 6, 15, 0, 0),  # Mid-June
                "end": datetime(2025, 6, 16, 0, 0),  # One full day
            },
            "three_days": {
                "start": datetime(2025, 6, 10, 0, 0),
                "end": datetime(2025, 6, 13, 0, 0),  # 3 days in June
            },
            "recent_synthetic": {
                "start": now - timedelta(hours=6),
                "end": now,  # Recent data (will be synthetic)
            },
        }

    async def test_quick_historical_traffic_june2025(self, client, madrid_coords, june_2025_dates):
        """Test quick historical traffic data from June 2025

        Checks the result type, that it is non-empty, that the call finishes
        within 30 seconds, and the shape and value ranges of the first record.
        """
        lat, lon = madrid_coords
        window = june_2025_dates["quick"]
        start_time, end_time = window["start"], window["end"]

        print("\n=== Quick Test (June 2025 - 6 hours) ===")
        print(f"Location: {lat}, {lon}")
        print(f"Date range: {_fmt(start_time)} to {_fmt(end_time)}")
        print("Note: Testing with June 2025 data (last available historical month)")

        # Time the call with a monotonic clock so wall-clock adjustments
        # cannot distort the measurement.
        execution_start = time.perf_counter()
        result = await client.get_historical_traffic(lat, lon, start_time, end_time)
        execution_time = time.perf_counter() - execution_start

        print(f"⏱️ Execution time: {execution_time:.2f} seconds")
        print(f"📊 Records returned: {len(result)}")

        # Assertions
        assert isinstance(result, list), "Result should be a list"
        assert len(result) > 0, "Should return at least some records"
        assert execution_time < 30, "Should execute in reasonable time (allowing for ZIP download)"

        # Check first record structure (non-empty is guaranteed by the assert above)
        sample = result[0]
        print(f"📋 Sample record keys: {list(sample.keys())}")
        print(f"📡 Data source: {sample.get('source', 'unknown')}")

        # Required fields
        required_fields = ['date', 'traffic_volume', 'congestion_level', 'average_speed', 'source']
        for field in required_fields:
            assert field in sample, f"Missing required field: {field}"

        # Data validation
        assert isinstance(sample['traffic_volume'], int), "Traffic volume should be int"
        assert 0 <= sample['traffic_volume'] <= 1000, "Traffic volume should be reasonable"
        assert sample['congestion_level'] in ['low', 'medium', 'high', 'blocked'], "Invalid congestion level"
        assert 5 <= sample['average_speed'] <= 100, "Speed should be reasonable"
        assert isinstance(sample['date'], datetime), "Date should be datetime object"

        # Check if we got real Madrid data or synthetic
        if sample['source'] == REAL_DATA_SOURCE:
            print("🎉 SUCCESS: Got real Madrid historical data from ZIP!")
        else:
            print("ℹ️ Got synthetic data (real data may not be available)")

        print("✅ All validations passed")

    async def test_one_day_june2025(self, client, madrid_coords, june_2025_dates):
        """Test one day of June 2025 historical traffic data

        Asserts that between 20 and 30 records come back, then prints (without
        asserting) the data sources and a rush-hour versus night comparison.
        """
        lat, lon = madrid_coords
        window = june_2025_dates["one_day"]
        start_time, end_time = window["start"], window["end"]

        print("\n=== One Day Test (June 15, 2025) ===")
        print(f"Date range: {_fmt(start_time)} to {_fmt(end_time)}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records returned: {len(result)}")

        # Should have roughly 24 records (one per hour)
        assert len(result) >= 20, "Should have at least 20 hourly records for one day"
        assert len(result) <= 30, "Should not have more than 30 records for one day"

        # Check data source
        print(f"📡 Data sources: {', '.join(_data_sources(result))}")

        # If we got real data, check for realistic measurement point IDs
        real_data_records = _real_records(result)
        if real_data_records:
            point_ids = {r['measurement_point_id'] for r in real_data_records}
            print(f"🏷️ Real measurement points found: {len(point_ids)}")
            print(f" Sample IDs: {list(point_ids)[:3]}")

        # Check traffic patterns (only meaningful with a full day of records)
        if len(result) < 24:
            return

        # Rush hour buckets: hours 7-9 and 18-20; night: hour <= 6 or >= 22
        rush_hour_records = [
            r for r in result
            if 7 <= r['date'].hour <= 9 or 18 <= r['date'].hour <= 20
        ]
        night_records = [
            r for r in result
            if r['date'].hour <= 6 or r['date'].hour >= 22
        ]
        if not (rush_hour_records and night_records):
            return

        avg_rush_traffic = _mean([r['traffic_volume'] for r in rush_hour_records])
        avg_night_traffic = _mean([r['traffic_volume'] for r in night_records])

        print(f"📈 Rush hour avg traffic: {avg_rush_traffic:.1f}")
        print(f"🌙 Night avg traffic: {avg_night_traffic:.1f}")

        # Rush hour should typically have more traffic than night;
        # this is reported, not asserted.
        if avg_rush_traffic > avg_night_traffic:
            print("✅ Traffic patterns look realistic")
        else:
            print("⚠️ Traffic patterns unusual (not necessarily wrong)")

    async def test_three_days_june2025(self, client, madrid_coords, june_2025_dates):
        """Test three days of June 2025 historical traffic data

        Asserts a record count between 60 and 90 and sanity bounds on the
        average traffic volume and average speed.
        """
        lat, lon = madrid_coords
        window = june_2025_dates["three_days"]
        start_time, end_time = window["start"], window["end"]

        print("\n=== Three Days Test (June 10-13, 2025) ===")
        print(f"Date range: {_fmt(start_time, '%Y-%m-%d')} to {_fmt(end_time, '%Y-%m-%d')}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records returned: {len(result)}")

        # Should have roughly 72 records (24 hours * 3 days)
        assert len(result) >= 60, "Should have at least 60 records for 3 days"
        assert len(result) <= 90, "Should not have more than 90 records for 3 days"

        # Check data sources
        print(f"📡 Data sources: {', '.join(_data_sources(result))}")

        # Calculate statistics (result is non-empty thanks to the asserts above)
        traffic_volumes = [r['traffic_volume'] for r in result]
        speeds = [r['average_speed'] for r in result]

        avg_traffic = _mean(traffic_volumes)
        max_traffic = max(traffic_volumes)
        min_traffic = min(traffic_volumes)
        avg_speed = _mean(speeds)

        print("📈 Statistics:")
        print(f" Average traffic: {avg_traffic:.1f}")
        print(f" Max traffic: {max_traffic}")
        print(f" Min traffic: {min_traffic}")
        print(f" Average speed: {avg_speed:.1f} km/h")

        # Analyze by data source
        real_data_records = _real_records(result)
        synthetic_count = len(result) - len(real_data_records)

        print("🔍 Data breakdown:")
        print(f" Real Madrid data: {len(real_data_records)} records")
        print(f" Synthetic data: {synthetic_count} records")

        if real_data_records:
            # Show measurement points from real data
            real_points = {r['measurement_point_id'] for r in real_data_records}
            print(f" Real measurement points: {len(real_points)}")

        # Sanity checks
        assert 10 <= avg_traffic <= 500, "Average traffic should be reasonable"
        assert 10 <= avg_speed <= 60, "Average speed should be reasonable"
        assert max_traffic >= avg_traffic, "Max should be >= average"
        assert min_traffic <= avg_traffic, "Min should be <= average"

    async def test_recent_vs_historical_data(self, client, madrid_coords, june_2025_dates):
        """Compare recent data (synthetic) vs June 2025 data (potentially real)

        Diagnostic only: prints record counts, sources and a sample real
        record; makes no assertions.
        """
        lat, lon = madrid_coords

        print("\n=== Recent vs Historical Data Comparison ===")

        # Test recent data (should be synthetic)
        recent_range = june_2025_dates["recent_synthetic"]
        recent_result = await client.get_historical_traffic(
            lat, lon, recent_range["start"], recent_range["end"]
        )

        # Test June 2025 data (potentially real)
        june_range = june_2025_dates["quick"]
        june_result = await client.get_historical_traffic(
            lat, lon, june_range["start"], june_range["end"]
        )

        print(f"📊 Recent data: {len(recent_result)} records")
        print(f"📊 June 2025 data: {len(june_result)} records")

        if recent_result:
            print(f"📡 Recent sources: {', '.join(_data_sources(recent_result))}")

        if not june_result:
            return

        print(f"📡 June sources: {', '.join(_data_sources(june_result))}")

        # Check if we successfully got real data from June
        real_records = _real_records(june_result)
        if not real_records:
            print("ℹ️ June data is synthetic (real ZIP may not be accessible)")
            return

        print("🎉 SUCCESS: Real Madrid data successfully fetched from June 2025!")

        # Show details of real data
        sample = real_records[0]
        print("📋 Real data sample:")
        print(f" Date: {sample['date']}")
        print(f" Traffic volume: {sample['traffic_volume']}")
        print(f" Measurement point: {sample['measurement_point_id']}")
        print(f" Point name: {sample.get('measurement_point_name', 'N/A')}")

    async def test_madrid_zip_month_code(self, client):
        """Test the month code calculation for Madrid ZIP files

        Diagnostic only: mismatches are reported with a warning marker rather
        than failing the test. If the client has no
        ``_calculate_madrid_month_code`` attribute, a single warning is printed.
        """
        print("\n=== Madrid ZIP Month Code Test ===")

        calculate = getattr(client, '_calculate_madrid_month_code', None)
        if calculate is None:
            print("⚠️ Month code calculation function not available")
            return

        # (year, month, expected month code)
        test_cases = [
            (2025, 6, 145),  # Known: June 2025 = 145
            (2025, 5, 144),  # Known: May 2025 = 144
            (2025, 4, 143),  # Known: April 2025 = 143
            (2025, 7, 146),  # Predicted: July 2025 = 146
        ]

        for year, month, expected_code in test_cases:
            calculated_code = calculate(year, month)
            status = "✅" if calculated_code == expected_code else "⚠️"
            print(f"{status} {year}-{month:02d}: Expected {expected_code}, Got {calculated_code}")

            # Generate ZIP URL
            if calculated_code:
                zip_url = ZIP_URL_TEMPLATE.format(code=calculated_code)
                print(f" ZIP URL: {zip_url}")

    async def test_edge_case_large_date_range(self, client, madrid_coords):
        """Test edge case: date range too large

        Expects the client to return an empty list for a range longer than
        90 days.
        """
        lat, lon = madrid_coords
        start_time = datetime(2025, 1, 1)  # 6+ months range
        end_time = datetime(2025, 7, 1)

        print("\n=== Edge Case: Large Date Range ===")
        print(f"Testing 6-month range: {start_time.date()} to {end_time.date()}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records for 6-month range: {len(result)}")

        # Should return empty list for ranges > 90 days
        assert len(result) == 0, "Should return empty list for date ranges > 90 days"
        print("✅ Correctly handled large date range")

    async def test_edge_case_invalid_coordinates(self, client):
        """Test edge case: invalid coordinates

        Only requires that the call returns a list instead of raising.
        """
        print("\n=== Edge Case: Invalid Coordinates ===")

        start_time = datetime(2025, 6, 1)
        end_time = datetime(2025, 6, 1, 6, 0)

        # Test with invalid coordinates
        result = await client.get_historical_traffic(999.0, 999.0, start_time, end_time)

        print(f"📊 Records for invalid coords: {len(result)}")

        # Should either return empty list or synthetic data
        # The function should not crash
        assert isinstance(result, list), "Should return list even with invalid coords"
        print("✅ Handled invalid coordinates gracefully")

    async def test_real_madrid_zip_access(self, client):
        """Test if we can access the actual Madrid ZIP files

        Diagnostic only: every failure is printed and swallowed so that an
        unreachable data portal does not fail the suite. If the client has no
        ``_fetch_historical_zip`` attribute, a single warning is printed.
        """
        print("\n=== Real Madrid ZIP Access Test ===")

        fetch_zip = getattr(client, '_fetch_historical_zip', None)
        if fetch_zip is None:
            print("⚠️ ZIP fetch function not available")
            return

        # Known archives as (label, month code) pairs
        known_archives = [
            ("June 2025", 145),
            ("May 2025", 144),
            ("April 2025", 143),
        ]

        for month_name, code in known_archives:
            url = ZIP_URL_TEMPLATE.format(code=code)
            print(f"\nTesting {month_name}: {url}")

            # Deliberately broad: this is a best-effort connectivity probe.
            try:
                zip_data = await fetch_zip(url)
                if zip_data:
                    print(f"✅ Successfully fetched ZIP: {len(zip_data)} bytes")
                    # Try to inspect ZIP contents
                    _describe_zip(zip_data)
                else:
                    print("❌ Failed to fetch ZIP")
            except Exception as e:
                print(f"❌ Error testing ZIP access: {e}")


# Additional standalone test functions for manual running
async def run_manual_test():
    """Manual test function that can be run directly

    Fetches four hours of June 15, 2025 traffic for the Madrid center
    coordinates and prints the record count, sources and first record.
    """
    print("=" * 60)
    print("MADRID TRAFFIC TEST - JUNE 2025 DATA")
    print("=" * 60)

    client = MadridOpenDataClient()
    madrid_lat, madrid_lon = 40.4168, -3.7038

    # Test with June 2025 data (last available)
    start_time = datetime(2025, 6, 15, 14, 0)  # June 15, 2025 at 2 PM
    end_time = datetime(2025, 6, 15, 18, 0)  # Until 6 PM (4 hours)

    print("\nTesting June 15, 2025 data (2 PM - 6 PM)...")
    print("This should include afternoon traffic patterns")

    result = await client.get_historical_traffic(madrid_lat, madrid_lon, start_time, end_time)

    print(f"Result: {len(result)} records")

    if result:
        sources = _data_sources(result)
        print(f"Data sources: {', '.join(sources)}")

        if REAL_DATA_SOURCE in sources:
            print("🎉 Successfully got real Madrid data!")

        sample = result[0]
        print("\nSample record:")
        for key, value in sample.items():
            # Only format real datetimes; anything else is printed verbatim.
            if key == "date" and isinstance(value, datetime):
                print(f" {key}: {value.strftime('%Y-%m-%d %H:%M:%S')}")
            else:
                print(f" {key}: {value}")

    print("\n✅ Manual test completed!")


if __name__ == "__main__":
    # If run directly, execute manual test
    asyncio.run(run_manual_test())