REFACTOR - Madrid Open data file
This commit is contained in:
1729
services/data/app/external/madrid_opendata.py
vendored
1729
services/data/app/external/madrid_opendata.py
vendored
File diff suppressed because it is too large
Load Diff
@@ -16,67 +16,3 @@ from app.core.database import Base, get_db
|
||||
from app.models.sales import SalesData
|
||||
from app.models.weather import WeatherData, WeatherForecast
|
||||
from app.models.traffic import TrafficData
|
||||
|
||||
# Test database URL: in-memory SQLite through the async aiosqlite driver.
TEST_DATABASE_URL = "sqlite+aiosqlite:///:memory:"

# Create test engine.
# StaticPool pins a single connection so the in-memory database survives
# across sessions; check_same_thread=False is required because the async
# driver may touch the connection from worker threads.
test_engine = create_async_engine(
    TEST_DATABASE_URL,
    connect_args={"check_same_thread": False},
    poolclass=StaticPool,
)

# Session factory bound to the test engine.  expire_on_commit=False keeps
# ORM objects usable after a commit without re-fetching them.
TestingSessionLocal = async_sessionmaker(
    test_engine,
    class_=AsyncSession,
    expire_on_commit=False,
)
|
||||
|
||||
@pytest_asyncio.fixture
async def db():
    """Create test database session.

    Creates every table before yielding a session and drops them all
    afterwards, so each test runs against a fresh schema.
    """
    async with test_engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)

    async with TestingSessionLocal() as session:
        yield session

    async with test_engine.begin() as conn:
        await conn.run_sync(Base.metadata.drop_all)
|
||||
|
||||
@pytest.fixture
def client():
    """Create test client.

    Overrides the app's ``get_db`` dependency with a session bound to the
    in-memory test engine, yields a ``TestClient``, and clears the override
    on teardown so other tests are unaffected.
    """
    async def override_get_db():
        async with TestingSessionLocal() as session:
            yield session

    app.dependency_overrides[get_db] = override_get_db

    with TestClient(app) as test_client:
        yield test_client

    app.dependency_overrides.clear()
|
||||
|
||||
@pytest.fixture
def test_tenant_id():
    """Test tenant ID (a fresh random UUID per test)."""
    return uuid.uuid4()
|
||||
|
||||
@pytest.fixture
def test_sales_data():
    """Sample sales data for testing a single sales record."""
    return {
        "date": datetime.now(),
        "product_name": "Pan Integral",
        "quantity_sold": 25,
        "revenue": 37.50,
        "location_id": "madrid_centro",
        "source": "test",
    }
|
||||
|
||||
@pytest.fixture
def mock_auth_token():
    """Mock authentication token in ``Bearer`` header format."""
    return "Bearer test-token-123"
|
||||
16
services/data/tests/pytest.ini
Normal file
16
services/data/tests/pytest.ini
Normal file
@@ -0,0 +1,16 @@
|
||||
[tool:pytest]
# pytest.ini - Configuration for async testing
asyncio_mode = auto
addopts = -v --tb=short --capture=no
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
markers =
    asyncio: mark test as async
    slow: mark test as slow
    integration: mark test as integration test
filterwarnings =
    ignore::DeprecationWarning
    ignore::PendingDeprecationWarning
    ignore::PydanticDeprecatedSince20
|
||||
405
services/data/tests/test_madrid_opendata.py
Normal file
405
services/data/tests/test_madrid_opendata.py
Normal file
@@ -0,0 +1,405 @@
|
||||
#!/usr/bin/env python3
"""
Updated Madrid Historical Traffic test for pytest inside Docker.

Configured for June 2025 data availability (last available historical data).
"""

import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List

import pytest
import structlog

# Import from the actual service
from app.external.madrid_opendata import MadridOpenDataClient
from app.core.config import settings

# Configure pytest for async: every coroutine test in this module is
# collected as an asyncio test.
pytestmark = pytest.mark.asyncio

# Use the service's structured logger rather than the stdlib one.
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class TestMadridTrafficInside:
    """Test class for Madrid traffic functionality inside Docker."""

    @pytest.fixture
    def client(self):
        """Create Madrid client for testing."""
        return MadridOpenDataClient()

    @pytest.fixture
    def madrid_coords(self):
        """Madrid center coordinates (lat, lon)."""
        return 40.4168, -3.7038

    @pytest.fixture
    def june_2025_dates(self):
        """Date ranges for June 2025 (last available historical data)."""
        return {
            "quick": {
                "start": datetime(2025, 6, 1, 0, 0),
                "end": datetime(2025, 6, 1, 6, 0),  # 6 hours on June 1st
            },
            "one_day": {
                "start": datetime(2025, 6, 15, 0, 0),  # Mid-June
                "end": datetime(2025, 6, 16, 0, 0),  # One full day
            },
            "three_days": {
                "start": datetime(2025, 6, 10, 0, 0),
                "end": datetime(2025, 6, 13, 0, 0),  # 3 days in June
            },
            "recent_synthetic": {
                "start": datetime.now() - timedelta(hours=6),
                "end": datetime.now(),  # Recent data (will be synthetic)
            },
        }

    async def test_quick_historical_traffic_june2025(self, client, madrid_coords, june_2025_dates):
        """Test quick historical traffic data from June 2025."""
        lat, lon = madrid_coords
        date_range = june_2025_dates["quick"]
        start_time = date_range["start"]
        end_time = date_range["end"]

        print(f"\n=== Quick Test (June 2025 - 6 hours) ===")
        print(f"Location: {lat}, {lon}")
        print(f"Date range: {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')}")
        print(f"Note: Testing with June 2025 data (last available historical month)")

        # Time the call so we can assert it finishes in a reasonable window.
        execution_start = datetime.now()
        result = await client.get_historical_traffic(lat, lon, start_time, end_time)
        execution_time = (datetime.now() - execution_start).total_seconds()

        print(f"⏱️ Execution time: {execution_time:.2f} seconds")
        print(f"📊 Records returned: {len(result)}")

        # Basic shape assertions.
        assert isinstance(result, list), "Result should be a list"
        assert len(result) > 0, "Should return at least some records"
        assert execution_time < 30, "Should execute in reasonable time (allowing for ZIP download)"

        # Check first record structure.
        if result:
            sample = result[0]
            print(f"📋 Sample record keys: {list(sample.keys())}")
            print(f"📡 Data source: {sample.get('source', 'unknown')}")

            # Required fields must all be present.
            required_fields = ['date', 'traffic_volume', 'congestion_level', 'average_speed', 'source']
            for field in required_fields:
                assert field in sample, f"Missing required field: {field}"

            # Data validation: sanity-check types and plausible ranges.
            assert isinstance(sample['traffic_volume'], int), "Traffic volume should be int"
            assert 0 <= sample['traffic_volume'] <= 1000, "Traffic volume should be reasonable"
            assert sample['congestion_level'] in ['low', 'medium', 'high', 'blocked'], "Invalid congestion level"
            assert 5 <= sample['average_speed'] <= 100, "Speed should be reasonable"
            assert isinstance(sample['date'], datetime), "Date should be datetime object"

            # Report whether we got real Madrid data or a synthetic fallback.
            if sample['source'] == 'madrid_opendata_zip':
                print(f"🎉 SUCCESS: Got real Madrid historical data from ZIP!")
            else:
                print(f"ℹ️ Got synthetic data (real data may not be available)")

        print(f"✅ All validations passed")

    async def test_one_day_june2025(self, client, madrid_coords, june_2025_dates):
        """Test one day of June 2025 historical traffic data."""
        lat, lon = madrid_coords
        date_range = june_2025_dates["one_day"]
        start_time = date_range["start"]
        end_time = date_range["end"]

        print(f"\n=== One Day Test (June 15, 2025) ===")
        print(f"Date range: {start_time.strftime('%Y-%m-%d %H:%M')} to {end_time.strftime('%Y-%m-%d %H:%M')}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records returned: {len(result)}")

        # Expect roughly 24 records (one per hour), with slack either way.
        assert len(result) >= 20, "Should have at least 20 hourly records for one day"
        assert len(result) <= 30, "Should not have more than 30 records for one day"

        # Check data source breakdown.
        if result:
            sources = set(r['source'] for r in result)
            print(f"📡 Data sources: {', '.join(sources)}")

            # If we got real data, check for realistic measurement point IDs.
            real_data_records = [r for r in result if r['source'] == 'madrid_opendata_zip']
            if real_data_records:
                point_ids = set(r['measurement_point_id'] for r in real_data_records)
                print(f"🏷️ Real measurement points found: {len(point_ids)}")
                print(f" Sample IDs: {list(point_ids)[:3]}")

        # Check traffic patterns over the day, when we have full coverage.
        if len(result) >= 24:
            # Find rush hour records (7-9 AM, 6-8 PM) vs night records.
            rush_hour_records = [r for r in result if 7 <= r['date'].hour <= 9 or 18 <= r['date'].hour <= 20]
            night_records = [r for r in result if r['date'].hour <= 6 or r['date'].hour >= 22]

            if rush_hour_records and night_records:
                avg_rush_traffic = sum(r['traffic_volume'] for r in rush_hour_records) / len(rush_hour_records)
                avg_night_traffic = sum(r['traffic_volume'] for r in night_records) / len(night_records)

                print(f"📈 Rush hour avg traffic: {avg_rush_traffic:.1f}")
                print(f"🌙 Night avg traffic: {avg_night_traffic:.1f}")

                # Rush hour should typically have more traffic than night.
                if avg_rush_traffic > avg_night_traffic:
                    print(f"✅ Traffic patterns look realistic")
                else:
                    print(f"⚠️ Traffic patterns unusual (not necessarily wrong)")

    async def test_three_days_june2025(self, client, madrid_coords, june_2025_dates):
        """Test three days of June 2025 historical traffic data."""
        lat, lon = madrid_coords
        date_range = june_2025_dates["three_days"]
        start_time = date_range["start"]
        end_time = date_range["end"]

        print(f"\n=== Three Days Test (June 10-13, 2025) ===")
        print(f"Date range: {start_time.strftime('%Y-%m-%d')} to {end_time.strftime('%Y-%m-%d')}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records returned: {len(result)}")

        # Expect roughly 72 records (24 hours * 3 days), with slack.
        assert len(result) >= 60, "Should have at least 60 records for 3 days"
        assert len(result) <= 90, "Should not have more than 90 records for 3 days"

        # Report data sources.
        sources = set(r['source'] for r in result)
        print(f"📡 Data sources: {', '.join(sources)}")

        # Aggregate statistics over the whole range.
        traffic_volumes = [r['traffic_volume'] for r in result]
        speeds = [r['average_speed'] for r in result]

        avg_traffic = sum(traffic_volumes) / len(traffic_volumes)
        max_traffic = max(traffic_volumes)
        min_traffic = min(traffic_volumes)
        avg_speed = sum(speeds) / len(speeds)

        print(f"📈 Statistics:")
        print(f" Average traffic: {avg_traffic:.1f}")
        print(f" Max traffic: {max_traffic}")
        print(f" Min traffic: {min_traffic}")
        print(f" Average speed: {avg_speed:.1f} km/h")

        # Analyze by data source.
        real_data_records = [r for r in result if r['source'] == 'madrid_opendata_zip']
        synthetic_records = [r for r in result if r['source'] != 'madrid_opendata_zip']

        print(f"🔍 Data breakdown:")
        print(f" Real Madrid data: {len(real_data_records)} records")
        print(f" Synthetic data: {len(synthetic_records)} records")

        if real_data_records:
            # Show measurement points from real data.
            real_points = set(r['measurement_point_id'] for r in real_data_records)
            print(f" Real measurement points: {len(real_points)}")

        # Sanity checks on the aggregates.
        assert 10 <= avg_traffic <= 500, "Average traffic should be reasonable"
        assert 10 <= avg_speed <= 60, "Average speed should be reasonable"
        assert max_traffic >= avg_traffic, "Max should be >= average"
        assert min_traffic <= avg_traffic, "Min should be <= average"

    async def test_recent_vs_historical_data(self, client, madrid_coords, june_2025_dates):
        """Compare recent data (synthetic) vs June 2025 data (potentially real)."""
        lat, lon = madrid_coords

        print(f"\n=== Recent vs Historical Data Comparison ===")

        # Test recent data (should be synthetic).
        recent_range = june_2025_dates["recent_synthetic"]
        recent_result = await client.get_historical_traffic(
            lat, lon, recent_range["start"], recent_range["end"]
        )

        # Test June 2025 data (potentially real).
        june_range = june_2025_dates["quick"]
        june_result = await client.get_historical_traffic(
            lat, lon, june_range["start"], june_range["end"]
        )

        print(f"📊 Recent data: {len(recent_result)} records")
        print(f"📊 June 2025 data: {len(june_result)} records")

        if recent_result:
            recent_sources = set(r['source'] for r in recent_result)
            print(f"📡 Recent sources: {', '.join(recent_sources)}")

        if june_result:
            june_sources = set(r['source'] for r in june_result)
            print(f"📡 June sources: {', '.join(june_sources)}")

            # Check if we successfully got real data from June.
            if 'madrid_opendata_zip' in june_sources:
                print(f"🎉 SUCCESS: Real Madrid data successfully fetched from June 2025!")

                # Show details of real data.
                real_records = [r for r in june_result if r['source'] == 'madrid_opendata_zip']
                if real_records:
                    sample = real_records[0]
                    print(f"📋 Real data sample:")
                    print(f" Date: {sample['date']}")
                    print(f" Traffic volume: {sample['traffic_volume']}")
                    print(f" Measurement point: {sample['measurement_point_id']}")
                    print(f" Point name: {sample.get('measurement_point_name', 'N/A')}")
            else:
                print(f"ℹ️ June data is synthetic (real ZIP may not be accessible)")

    async def test_madrid_zip_month_code(self, client):
        """Test the month code calculation for Madrid ZIP files."""
        print(f"\n=== Madrid ZIP Month Code Test ===")

        # Known and predicted (year, month) -> Madrid catalogue month code.
        test_cases = [
            (2025, 6, 145),  # Known: June 2025 = 145
            (2025, 5, 144),  # Known: May 2025 = 144
            (2025, 4, 143),  # Known: April 2025 = 143
            (2025, 7, 146),  # Predicted: July 2025 = 146
        ]

        for year, month, expected_code in test_cases:
            if hasattr(client, '_calculate_madrid_month_code'):
                calculated_code = client._calculate_madrid_month_code(year, month)
                status = "✅" if calculated_code == expected_code else "⚠️"
                print(f"{status} {year}-{month:02d}: Expected {expected_code}, Got {calculated_code}")

                # Generate ZIP URL from the calculated code.
                if calculated_code:
                    zip_url = f"https://datos.madrid.es/egob/catalogo/208627-{calculated_code}-transporte-ptomedida-historico.zip"
                    print(f" ZIP URL: {zip_url}")
            else:
                print(f"⚠️ Month code calculation function not available")

    async def test_edge_case_large_date_range(self, client, madrid_coords):
        """Test edge case: date range too large."""
        lat, lon = madrid_coords
        start_time = datetime(2025, 1, 1)  # 6+ months range
        end_time = datetime(2025, 7, 1)

        print(f"\n=== Edge Case: Large Date Range ===")
        print(f"Testing 6-month range: {start_time.date()} to {end_time.date()}")

        result = await client.get_historical_traffic(lat, lon, start_time, end_time)

        print(f"📊 Records for 6-month range: {len(result)}")

        # Should return empty list for ranges > 90 days.
        assert len(result) == 0, "Should return empty list for date ranges > 90 days"
        print(f"✅ Correctly handled large date range")

    async def test_edge_case_invalid_coordinates(self, client):
        """Test edge case: invalid coordinates."""
        print(f"\n=== Edge Case: Invalid Coordinates ===")

        start_time = datetime(2025, 6, 1)
        end_time = datetime(2025, 6, 1, 6, 0)

        # Test with clearly-out-of-range coordinates.
        result = await client.get_historical_traffic(999.0, 999.0, start_time, end_time)

        print(f"📊 Records for invalid coords: {len(result)}")

        # Should either return empty list or synthetic data;
        # the function should not crash.
        assert isinstance(result, list), "Should return list even with invalid coords"
        print(f"✅ Handled invalid coordinates gracefully")

    async def test_real_madrid_zip_access(self, client):
        """Test if we can access the actual Madrid ZIP files."""
        print(f"\n=== Real Madrid ZIP Access Test ===")

        # Known catalogue ZIP URLs for the last three available months.
        test_urls = [
            "https://datos.madrid.es/egob/catalogo/208627-145-transporte-ptomedida-historico.zip",  # June 2025
            "https://datos.madrid.es/egob/catalogo/208627-144-transporte-ptomedida-historico.zip",  # May 2025
            "https://datos.madrid.es/egob/catalogo/208627-143-transporte-ptomedida-historico.zip",  # April 2025
        ]

        for i, url in enumerate(test_urls):
            month_name = ["June 2025", "May 2025", "April 2025"][i]
            print(f"\nTesting {month_name}: {url}")

            try:
                if hasattr(client, '_fetch_historical_zip'):
                    zip_data = await client._fetch_historical_zip(url)
                    if zip_data:
                        print(f"✅ Successfully fetched ZIP: {len(zip_data)} bytes")

                        # Try to inspect ZIP contents without extracting.
                        try:
                            import zipfile
                            from io import BytesIO

                            with zipfile.ZipFile(BytesIO(zip_data), 'r') as zip_file:
                                files = zip_file.namelist()
                                csv_files = [f for f in files if f.endswith('.csv')]
                                print(f"📁 ZIP contains {len(files)} files, {len(csv_files)} CSV files")

                                if csv_files:
                                    print(f" CSV files: {csv_files[:2]}{'...' if len(csv_files) > 2 else ''}")

                        except Exception as e:
                            print(f"⚠️ Could not inspect ZIP contents: {e}")
                    else:
                        print(f"❌ Failed to fetch ZIP")
                else:
                    print(f"⚠️ ZIP fetch function not available")

            except Exception as e:
                print(f"❌ Error testing ZIP access: {e}")
|
||||
|
||||
|
||||
# Additional standalone test functions for manual running
|
||||
async def run_manual_test():
    """Manual test function that can be run directly (outside pytest)."""
    print("=" * 60)
    print("MADRID TRAFFIC TEST - JUNE 2025 DATA")
    print("=" * 60)

    client = MadridOpenDataClient()
    madrid_lat, madrid_lon = 40.4168, -3.7038

    # Test with June 2025 data (last available): June 15, 2 PM - 6 PM.
    start_time = datetime(2025, 6, 15, 14, 0)
    end_time = datetime(2025, 6, 15, 18, 0)

    print(f"\nTesting June 15, 2025 data (2 PM - 6 PM)...")
    print(f"This should include afternoon traffic patterns")

    result = await client.get_historical_traffic(madrid_lat, madrid_lon, start_time, end_time)

    print(f"Result: {len(result)} records")

    if result:
        sources = set(r['source'] for r in result)
        print(f"Data sources: {', '.join(sources)}")

        if 'madrid_opendata_zip' in sources:
            print(f"🎉 Successfully got real Madrid data!")

        # Dump the first record, formatting the timestamp explicitly.
        sample = result[0]
        print(f"\nSample record:")
        for key, value in sample.items():
            if key == "date":
                print(f" {key}: {value.strftime('%Y-%m-%d %H:%M:%S')}")
            else:
                print(f" {key}: {value}")

    print(f"\n✅ Manual test completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# If run directly, execute manual test
|
||||
asyncio.run(run_manual_test())
|
||||
Reference in New Issue
Block a user