Add POI feature and improve the overall backend implementation

This commit is contained in:
Urtzi Alfaro
2025-11-12 15:34:10 +01:00
parent e8096cd979
commit 5783c7ed05
173 changed files with 16862 additions and 9078 deletions

View File

@@ -0,0 +1,269 @@
"""
Competitor Analyzer
Specialized analysis for competitor bakeries with competitive pressure modeling.
Treats competitor proximity differently than other POIs, considering market dynamics.
"""
from typing import Any, Dict, List, Optional, Tuple
import structlog
from math import radians, sin, cos, sqrt, atan2
from app.core.poi_config import COMPETITOR_ZONES
logger = structlog.get_logger()
class CompetitorAnalyzer:
"""
Competitive landscape analyzer for bakery locations.
Models competitive pressure considering:
- Direct competition (<100m): Strong negative impact
- Nearby competition (100-500m): Moderate negative impact
- Market saturation (500-1000m): Can be positive (bakery district)
or negative (competitive market)
"""
def analyze_competitive_landscape(
self,
competitor_pois: List[Dict[str, Any]],
bakery_location: Tuple[float, float],
tenant_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Analyze competitive pressure from nearby bakeries.
Args:
competitor_pois: List of detected competitor POIs
bakery_location: Tuple of (latitude, longitude)
tenant_id: Optional tenant ID for logging
Returns:
Competitive analysis with pressure scores and market classification
"""
if not competitor_pois:
logger.info(
"No competitors detected - underserved market",
tenant_id=tenant_id
)
return {
"competitive_pressure_score": 0.0,
"direct_competitors_count": 0,
"nearby_competitors_count": 0,
"market_competitors_count": 0,
"competitive_zone": "low_competition",
"market_type": "underserved",
"competitive_advantage": "first_mover",
"ml_feature_competitive_pressure": 0.0,
"ml_feature_has_direct_competitor": 0,
"ml_feature_competitor_density_500m": 0,
"competitor_details": []
}
# Categorize competitors by distance
direct_competitors = [] # <100m
nearby_competitors = [] # 100-500m
market_competitors = [] # 500-1000m
competitor_details = []
for poi in competitor_pois:
distance_m = self._calculate_distance(
bakery_location, (poi["lat"], poi["lon"])
) * 1000
competitor_info = {
"name": poi.get("name", "Unnamed"),
"osm_id": poi.get("osm_id"),
"distance_m": round(distance_m, 1),
"lat": poi["lat"],
"lon": poi["lon"]
}
if distance_m < COMPETITOR_ZONES["direct"]["max_distance_m"]:
direct_competitors.append(poi)
competitor_info["zone"] = "direct"
elif distance_m < COMPETITOR_ZONES["nearby"]["max_distance_m"]:
nearby_competitors.append(poi)
competitor_info["zone"] = "nearby"
elif distance_m < COMPETITOR_ZONES["market"]["max_distance_m"]:
market_competitors.append(poi)
competitor_info["zone"] = "market"
competitor_details.append(competitor_info)
# Calculate competitive pressure score
direct_pressure = (
len(direct_competitors) *
COMPETITOR_ZONES["direct"]["pressure_multiplier"]
)
nearby_pressure = (
len(nearby_competitors) *
COMPETITOR_ZONES["nearby"]["pressure_multiplier"]
)
# Market saturation analysis
min_for_district = COMPETITOR_ZONES["market"]["min_count_for_district"]
if len(market_competitors) >= min_for_district:
# Many bakeries = destination area (bakery district)
market_pressure = COMPETITOR_ZONES["market"]["district_multiplier"]
market_type = "bakery_district"
elif len(market_competitors) > 2:
market_pressure = COMPETITOR_ZONES["market"]["normal_multiplier"]
market_type = "competitive_market"
else:
market_pressure = 0.0
market_type = "normal_market"
competitive_pressure_score = (
direct_pressure + nearby_pressure + market_pressure
)
# Determine competitive zone classification
if len(direct_competitors) > 0:
competitive_zone = "high_competition"
competitive_advantage = "differentiation_required"
elif len(nearby_competitors) > 2:
competitive_zone = "moderate_competition"
competitive_advantage = "quality_focused"
else:
competitive_zone = "low_competition"
competitive_advantage = "local_leader"
# Sort competitors by distance
competitor_details.sort(key=lambda x: x["distance_m"])
logger.info(
"Competitive analysis complete",
tenant_id=tenant_id,
competitive_zone=competitive_zone,
market_type=market_type,
total_competitors=len(competitor_pois),
direct=len(direct_competitors),
nearby=len(nearby_competitors),
market=len(market_competitors),
pressure_score=competitive_pressure_score
)
return {
# Summary scores
"competitive_pressure_score": round(competitive_pressure_score, 2),
# Competitor counts by zone
"direct_competitors_count": len(direct_competitors),
"nearby_competitors_count": len(nearby_competitors),
"market_competitors_count": len(market_competitors),
"total_competitors_count": len(competitor_pois),
# Market classification
"competitive_zone": competitive_zone,
"market_type": market_type,
"competitive_advantage": competitive_advantage,
# ML features (for model integration)
"ml_feature_competitive_pressure": round(competitive_pressure_score, 2),
"ml_feature_has_direct_competitor": 1 if len(direct_competitors) > 0 else 0,
"ml_feature_competitor_density_500m": (
len(direct_competitors) + len(nearby_competitors)
),
# Detailed competitor information
"competitor_details": competitor_details,
# Nearest competitor
"nearest_competitor": competitor_details[0] if competitor_details else None
}
def _calculate_distance(
self,
coord1: Tuple[float, float],
coord2: Tuple[float, float]
) -> float:
"""
Calculate Haversine distance in kilometers.
Args:
coord1: Tuple of (latitude, longitude)
coord2: Tuple of (latitude, longitude)
Returns:
Distance in kilometers
"""
lat1, lon1 = coord1
lat2, lon2 = coord2
R = 6371 # Earth radius in km
dlat = radians(lat2 - lat1)
dlon = radians(lon2 - lon1)
a = (sin(dlat/2)**2 +
cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2)
c = 2 * atan2(sqrt(a), sqrt(1-a))
return R * c
def get_competitive_insights(
self,
analysis_result: Dict[str, Any]
) -> List[str]:
"""
Generate human-readable competitive insights.
Args:
analysis_result: Result from analyze_competitive_landscape
Returns:
List of insight strings for business intelligence
"""
insights = []
zone = analysis_result["competitive_zone"]
market = analysis_result["market_type"]
pressure = analysis_result["competitive_pressure_score"]
direct = analysis_result["direct_competitors_count"]
nearby = analysis_result["nearby_competitors_count"]
# Zone-specific insights
if zone == "high_competition":
insights.append(
f"⚠️ High competition: {direct} direct competitor(s) within 100m. "
"Focus on differentiation and quality."
)
elif zone == "moderate_competition":
insights.append(
f"Moderate competition: {nearby} nearby competitor(s) within 500m. "
"Good opportunity for market share."
)
else:
insights.append(
"✅ Low competition: Local market leader opportunity."
)
# Market type insights
if market == "bakery_district":
insights.append(
"📍 Bakery district: High foot traffic area with multiple bakeries. "
"Customers actively seek bakery products here."
)
elif market == "competitive_market":
insights.append(
"Market has multiple bakeries. Quality and customer service critical."
)
elif market == "underserved":
insights.append(
"🎯 Underserved market: Potential for strong customer base growth."
)
# Pressure score insight
if pressure < -1.5:
insights.append(
"Strong competitive pressure expected to impact demand. "
"Marketing and differentiation essential."
)
elif pressure > 0:
insights.append(
"Positive market dynamics: Location benefits from bakery destination traffic."
)
return insights
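
A minimal usage sketch for the analyzer above. The module path, tenant ID, coordinates, and competitor entries are illustrative assumptions; the zone thresholds come from COMPETITOR_ZONES in app.core.poi_config.

# Usage sketch (module path and all literals below are illustrative assumptions)
from app.services.competitor_analyzer import CompetitorAnalyzer

analyzer = CompetitorAnalyzer()
result = analyzer.analyze_competitive_landscape(
    competitor_pois=[
        {"name": "Panaderia Sol", "osm_id": "111", "lat": 43.2632, "lon": -2.9350},
        {"name": "Horno Luna", "osm_id": "222", "lat": 43.2650, "lon": -2.9401},
    ],
    bakery_location=(43.2630, -2.9349),
    tenant_id="tenant-demo",
)
print(result["competitive_zone"], result["competitive_pressure_score"])
for insight in analyzer.get_competitive_insights(result):
    print("-", insight)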

View File

@@ -0,0 +1,282 @@
"""
Nominatim Geocoding Service
Provides address search and geocoding using OpenStreetMap Nominatim API.
For development: uses the public API (rate-limited)
For production: should point to a self-hosted Nominatim instance
"""
import httpx
from typing import List, Dict, Any, Optional
import structlog
from asyncio import sleep
logger = structlog.get_logger()
class NominatimService:
"""
Nominatim geocoding and address search service.
Uses OpenStreetMap Nominatim API for address autocomplete and geocoding.
Respects rate limits and usage policy.
"""
# For development: public API (rate-limited to 1 req/sec)
# For production: should be overridden with a self-hosted instance
DEFAULT_BASE_URL = "https://nominatim.openstreetmap.org"
def __init__(self, base_url: Optional[str] = None, user_agent: str = "BakeryIA-Forecasting/1.0"):
"""
Initialize Nominatim service.
Args:
base_url: Nominatim server URL (defaults to public API)
user_agent: User agent for API requests (required by Nominatim policy)
"""
self.base_url = (base_url or self.DEFAULT_BASE_URL).rstrip("/")
self.user_agent = user_agent
self.headers = {
"User-Agent": self.user_agent
}
# Rate limiting for public API (1 request per second)
self.is_public_api = self.base_url == self.DEFAULT_BASE_URL
self.min_request_interval = 1.0 if self.is_public_api else 0.0
logger.info(
"Nominatim service initialized",
base_url=self.base_url,
is_public_api=self.is_public_api,
rate_limit=f"{self.min_request_interval}s" if self.is_public_api else "none"
)
async def search_address(
self,
query: str,
country_code: str = "es",
limit: int = 10
) -> List[Dict[str, Any]]:
"""
Search for addresses matching query (autocomplete).
Args:
query: Address search query
country_code: ISO country code to restrict search (default: Spain)
limit: Maximum number of results
Returns:
List of address suggestions with display_name, lat, lon, osm_id, etc.
"""
if not query or len(query.strip()) < 3:
logger.warning("Search query too short", query=query)
return []
try:
# Rate limiting for public API
if self.is_public_api:
await sleep(self.min_request_interval)
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.get(
f"{self.base_url}/search",
params={
"q": query,
"format": "json",
"addressdetails": 1,
"countrycodes": country_code,
"limit": limit,
"accept-language": "es"
},
headers=self.headers
)
response.raise_for_status()
results = response.json()
# Parse and enrich results
addresses = []
for result in results:
addresses.append({
"display_name": result.get("display_name"),
"lat": float(result.get("lat")),
"lon": float(result.get("lon")),
"osm_type": result.get("osm_type"),
"osm_id": result.get("osm_id"),
"place_id": result.get("place_id"),
"type": result.get("type"),
"class": result.get("class"),
"address": result.get("address", {}),
"boundingbox": result.get("boundingbox", [])
})
logger.info(
"Address search completed",
query=query,
result_count=len(addresses)
)
return addresses
except httpx.HTTPError as e:
logger.error(
"Nominatim API request failed",
query=query,
error=str(e)
)
return []
except Exception as e:
logger.error(
"Unexpected error in address search",
query=query,
error=str(e),
exc_info=True
)
return []
async def geocode_address(
self,
address: str,
country_code: str = "es"
) -> Optional[Dict[str, Any]]:
"""
Geocode an address to get coordinates.
Args:
address: Full address string
country_code: ISO country code
Returns:
Dictionary with lat, lon, display_name, address components or None
"""
results = await self.search_address(address, country_code, limit=1)
if not results:
logger.warning("No geocoding results found", address=address)
return None
result = results[0]
logger.info(
"Address geocoded successfully",
address=address,
lat=result["lat"],
lon=result["lon"]
)
return result
async def reverse_geocode(
self,
latitude: float,
longitude: float
) -> Optional[Dict[str, Any]]:
"""
Reverse geocode coordinates to get address.
Args:
latitude: Latitude coordinate
longitude: Longitude coordinate
Returns:
Dictionary with address information or None
"""
try:
# Rate limiting for public API
if self.is_public_api:
await sleep(self.min_request_interval)
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.get(
f"{self.base_url}/reverse",
params={
"lat": latitude,
"lon": longitude,
"format": "json",
"addressdetails": 1,
"accept-language": "es"
},
headers=self.headers
)
response.raise_for_status()
result = response.json()
address_info = {
"display_name": result.get("display_name"),
"lat": float(result.get("lat")),
"lon": float(result.get("lon")),
"osm_type": result.get("osm_type"),
"osm_id": result.get("osm_id"),
"place_id": result.get("place_id"),
"address": result.get("address", {}),
"boundingbox": result.get("boundingbox", [])
}
logger.info(
"Reverse geocoding completed",
lat=latitude,
lon=longitude,
address=address_info["display_name"]
)
return address_info
except httpx.HTTPError as e:
logger.error(
"Nominatim reverse geocoding failed",
lat=latitude,
lon=longitude,
error=str(e)
)
return None
except Exception as e:
logger.error(
"Unexpected error in reverse geocoding",
lat=latitude,
lon=longitude,
error=str(e),
exc_info=True
)
return None
async def validate_coordinates(
self,
latitude: float,
longitude: float
) -> bool:
"""
Validate that coordinates point to a real location.
Args:
latitude: Latitude to validate
longitude: Longitude to validate
Returns:
True if coordinates are valid, False otherwise
"""
if not (-90 <= latitude <= 90 and -180 <= longitude <= 180):
return False
result = await self.reverse_geocode(latitude, longitude)
return result is not None
async def health_check(self) -> bool:
"""
Check if Nominatim service is accessible.
Returns:
True if service is healthy, False otherwise
"""
try:
async with httpx.AsyncClient(timeout=5.0) as client:
response = await client.get(
f"{self.base_url}/status",
params={"format": "json"},
headers=self.headers
)
return response.status_code == 200
except Exception as e:
logger.error(
"Nominatim health check failed",
error=str(e)
)
return False
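
A short async usage sketch for the service above; the module path and the sample query are illustrative assumptions, and running it hits the public Nominatim API (paced at one request per second).

# Usage sketch (module path and sample query are illustrative assumptions)
import asyncio
from app.services.nominatim_service import NominatimService

async def main() -> None:
    service = NominatimService()  # defaults to the public API with 1 req/s pacing
    matches = await service.search_address("Gran Via 1, Bilbao", country_code="es", limit=3)
    for match in matches:
        print(match["display_name"], match["lat"], match["lon"])
    if matches:
        reverse = await service.reverse_geocode(matches[0]["lat"], matches[0]["lon"])
        print(reverse["display_name"] if reverse else "no reverse result")

asyncio.run(main())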

View File

@@ -0,0 +1,466 @@
"""
POI Detection Service
Automated Point of Interest detection using Overpass API (OpenStreetMap).
Detects nearby POIs around bakery locations and generates ML features
for location-based demand forecasting.
"""
import overpy
from typing import List, Dict, Any, Tuple, Optional
from datetime import datetime, timezone, timedelta
import asyncio
import structlog
import httpx
from math import radians, sin, cos, sqrt, atan2
import random
from app.core.poi_config import (
POI_CATEGORIES,
OVERPASS_API_URL,
OVERPASS_TIMEOUT_SECONDS,
OVERPASS_MAX_RETRIES,
OVERPASS_RETRY_DELAY_SECONDS,
DISTANCE_BANDS
)
logger = structlog.get_logger()
class POIDetectionService:
"""
Automated POI detection using Overpass API (OpenStreetMap).
Detects points of interest near bakery locations and calculates
ML features for demand forecasting with location-specific context.
"""
def __init__(self, overpass_url: str = OVERPASS_API_URL):
self.overpass_url = overpass_url
self.api = overpy.Overpass(url=overpass_url)
self.timeout = OVERPASS_TIMEOUT_SECONDS
async def detect_pois_for_bakery(
self,
latitude: float,
longitude: float,
tenant_id: str
) -> Dict[str, Any]:
"""
Detect all POIs around a bakery location.
Args:
latitude: Bakery latitude
longitude: Bakery longitude
tenant_id: Tenant identifier for logging
Returns:
Complete POI detection results with ML features
"""
logger.info(
"Starting POI detection",
tenant_id=tenant_id,
location=(latitude, longitude)
)
poi_results = {}
detection_errors = []
# Query each POI category with inter-query delays
category_items = list(POI_CATEGORIES.items())
for idx, (category_key, category) in enumerate(category_items):
try:
pois = await self._query_pois_with_retry(
latitude,
longitude,
category.osm_query,
category.search_radius_m,
category_key
)
# Calculate features for this category
features = self._calculate_poi_features(
pois,
(latitude, longitude),
category
)
poi_results[category_key] = {
"pois": pois,
"features": features,
"count": len(pois)
}
logger.info(
f"Detected {category_key}",
count=len(pois),
proximity_score=features["proximity_score"]
)
# Add delay between categories to respect rate limits
# (except after the last category)
if idx < len(category_items) - 1:
inter_query_delay = 2.0 + random.uniform(0.5, 1.5)
await asyncio.sleep(inter_query_delay)
except Exception as e:
logger.error(
f"Failed to detect {category_key}",
error=str(e),
tenant_id=tenant_id
)
detection_errors.append({
"category": category_key,
"error": str(e)
})
poi_results[category_key] = {
"pois": [],
"features": self._get_empty_features(),
"count": 0,
"error": str(e)
}
# Add a longer delay after an error before continuing
if idx < len(category_items) - 1:
error_recovery_delay = 3.0 + random.uniform(1.0, 2.0)
await asyncio.sleep(error_recovery_delay)
# Generate combined ML features
ml_features = self._generate_ml_features(poi_results)
# Generate summary
summary = self._generate_summary(poi_results)
detection_status = "completed" if not detection_errors else (
"partial" if len(detection_errors) < len(POI_CATEGORIES) else "failed"
)
return {
"tenant_id": tenant_id,
"location": {"latitude": latitude, "longitude": longitude},
"detection_timestamp": datetime.now(timezone.utc).isoformat(),
"detection_status": detection_status,
"detection_errors": detection_errors if detection_errors else None,
"poi_categories": poi_results,
"ml_features": ml_features,
"summary": summary
}
async def _query_pois_with_retry(
self,
latitude: float,
longitude: float,
osm_query: str,
radius_m: int,
category_key: str
) -> List[Dict[str, Any]]:
"""
Query Overpass API with exponential backoff retry logic.
Implements:
- Exponential backoff with jitter
- Extended delays for rate limiting errors
- Proper error type detection
"""
last_error = None
base_delay = OVERPASS_RETRY_DELAY_SECONDS
for attempt in range(OVERPASS_MAX_RETRIES):
try:
return await self._query_pois(
latitude, longitude, osm_query, radius_m
)
except Exception as e:
last_error = e
error_message = str(e).lower()
# Determine if this is a rate limiting error
is_rate_limit = any(phrase in error_message for phrase in [
'too many requests',
'rate limit',
'server load too high',
'quota exceeded',
'retry later',
'429',
'503',
'504'
])
if attempt < OVERPASS_MAX_RETRIES - 1:
# Calculate exponential backoff with jitter
# For rate limiting: use longer delays (10-30 seconds)
# For other errors: use standard backoff (2-8 seconds)
if is_rate_limit:
delay = base_delay * (3 ** attempt) + random.uniform(1, 5)
delay = min(delay, 30) # Cap at 30 seconds
else:
delay = base_delay * (2 ** attempt) + random.uniform(0.5, 1.5)
delay = min(delay, 10) # Cap at 10 seconds
logger.warning(
f"POI query retry {attempt + 1}/{OVERPASS_MAX_RETRIES}",
category=category_key,
error=str(e),
is_rate_limit=is_rate_limit,
retry_delay=f"{delay:.1f}s"
)
await asyncio.sleep(delay)
else:
logger.error(
"POI query failed after all retries",
category=category_key,
error=str(e),
is_rate_limit=is_rate_limit
)
raise last_error
async def _query_pois(
self,
latitude: float,
longitude: float,
osm_query: str,
radius_m: int
) -> List[Dict[str, Any]]:
"""
Query Overpass API for POIs in radius.
Raises:
Exception: With descriptive error message from Overpass API
"""
# Build Overpass QL query
query = f"""
[out:json][timeout:{self.timeout}];
(
node{osm_query}(around:{radius_m},{latitude},{longitude});
way{osm_query}(around:{radius_m},{latitude},{longitude});
);
out center;
"""
# Execute query (use asyncio thread pool for blocking overpy)
loop = asyncio.get_running_loop()
try:
result = await loop.run_in_executor(
None,
self.api.query,
query
)
except overpy.exception.OverpassTooManyRequests as e:
# Explicitly handle rate limiting
raise Exception("Too many requests - Overpass API rate limit exceeded") from e
except overpy.exception.OverpassGatewayTimeout as e:
# Query took too long
raise Exception("Gateway timeout - query too complex or server busy") from e
except overpy.exception.OverpassBadRequest as e:
# Query syntax error
raise Exception(f"Bad request - invalid query syntax: {str(e)}") from e
except Exception as e:
# Check if it's an HTTP error with status code
error_msg = str(e).lower()
if '429' in error_msg or 'too many' in error_msg:
raise Exception("Too many requests - rate limit exceeded") from e
elif '503' in error_msg or 'load too high' in error_msg:
raise Exception("Server load too high - Overpass API overloaded") from e
elif '504' in error_msg or 'timeout' in error_msg:
raise Exception("Gateway timeout - server busy") from e
else:
# Re-raise with original message
raise
# Parse results
pois = []
# Process nodes
for node in result.nodes:
pois.append({
"osm_id": str(node.id),
"type": "node",
"lat": float(node.lat),
"lon": float(node.lon),
"tags": dict(node.tags),
"name": node.tags.get("name", "Unnamed")
})
# Process ways (buildings, areas)
for way in result.ways:
# Get center point
if hasattr(way, 'center_lat') and way.center_lat:
lat, lon = float(way.center_lat), float(way.center_lon)
else:
# Fall back to the centroid of the way's member nodes; with "out center"
# the node geometry may be missing from the result, so skip such ways
try:
if not way.nodes:
continue
lats = [float(node.lat) for node in way.nodes]
lons = [float(node.lon) for node in way.nodes]
lat = sum(lats) / len(lats)
lon = sum(lons) / len(lons)
except overpy.exception.DataIncomplete:
continue
pois.append({
"osm_id": str(way.id),
"type": "way",
"lat": lat,
"lon": lon,
"tags": dict(way.tags),
"name": way.tags.get("name", "Unnamed")
})
return pois
def _calculate_poi_features(
self,
pois: List[Dict[str, Any]],
bakery_location: Tuple[float, float],
category
) -> Dict[str, float]:
"""Calculate ML features for POI category"""
if not pois:
return self._get_empty_features()
# Calculate distances
distances = []
for poi in pois:
dist_km = self._haversine_distance(
bakery_location,
(poi["lat"], poi["lon"])
)
distances.append(dist_km * 1000) # Convert to meters
# Feature Tier 1: Proximity Scores (PRIMARY)
proximity_score = sum(1.0 / (1.0 + d/1000) for d in distances)
weighted_proximity_score = proximity_score * category.weight
# Feature Tier 2: Distance Band Counts
count_0_100m = sum(1 for d in distances if d <= 100)
count_100_300m = sum(1 for d in distances if 100 < d <= 300)
count_300_500m = sum(1 for d in distances if 300 < d <= 500)
count_500_1000m = sum(1 for d in distances if 500 < d <= 1000)
# Feature Tier 3: Distance to Nearest
distance_to_nearest_m = min(distances) if distances else 9999.0
# Feature Tier 4: Binary Flags
has_within_100m = any(d <= 100 for d in distances)
has_within_300m = any(d <= 300 for d in distances)
has_within_500m = any(d <= 500 for d in distances)
return {
# Tier 1: Proximity scores (PRIMARY for ML)
"proximity_score": round(proximity_score, 4),
"weighted_proximity_score": round(weighted_proximity_score, 4),
# Tier 2: Distance bands
"count_0_100m": count_0_100m,
"count_100_300m": count_100_300m,
"count_300_500m": count_300_500m,
"count_500_1000m": count_500_1000m,
"total_count": len(pois),
# Tier 3: Distance to nearest
"distance_to_nearest_m": round(distance_to_nearest_m, 1),
# Tier 4: Binary flags
"has_within_100m": has_within_100m,
"has_within_300m": has_within_300m,
"has_within_500m": has_within_500m
}
def _generate_ml_features(self, poi_results: Dict[str, Any]) -> Dict[str, float]:
"""
Generate flat feature dictionary for ML model ingestion.
These features will be added to Prophet/XGBoost as regressors.
"""
ml_features = {}
for category_key, data in poi_results.items():
features = data.get("features", {})
# Flatten with category prefix
for feature_name, value in features.items():
ml_feature_name = f"poi_{category_key}_{feature_name}"
# Convert boolean to int for ML
if isinstance(value, bool):
value = 1 if value else 0
ml_features[ml_feature_name] = value
return ml_features
def _get_empty_features(self) -> Dict[str, float]:
"""Return zero features when no POIs found"""
return {
"proximity_score": 0.0,
"weighted_proximity_score": 0.0,
"count_0_100m": 0,
"count_100_300m": 0,
"count_300_500m": 0,
"count_500_1000m": 0,
"total_count": 0,
"distance_to_nearest_m": 9999.0,
"has_within_100m": False,
"has_within_300m": False,
"has_within_500m": False
}
def _haversine_distance(
self,
coord1: Tuple[float, float],
coord2: Tuple[float, float]
) -> float:
"""
Calculate distance between two coordinates in kilometers.
Uses Haversine formula for great-circle distance.
"""
lat1, lon1 = coord1
lat2, lon2 = coord2
R = 6371 # Earth radius in km
dlat = radians(lat2 - lat1)
dlon = radians(lon2 - lon1)
a = (sin(dlat/2)**2 +
cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2)
c = 2 * atan2(sqrt(a), sqrt(1-a))
return R * c
def _generate_summary(self, poi_results: Dict[str, Any]) -> Dict[str, Any]:
"""Generate human-readable summary"""
total_pois = sum(r["count"] for r in poi_results.values())
categories_with_pois = [
k for k, v in poi_results.items() if v["count"] > 0
]
high_impact_categories = [
k for k, v in poi_results.items()
if v["features"]["proximity_score"] > 2.0
]
return {
"total_pois_detected": total_pois,
"categories_with_pois": categories_with_pois,
"high_impact_categories": high_impact_categories,
"categories_count": len(categories_with_pois)
}
async def health_check(self) -> Dict[str, Any]:
"""Check if Overpass API is accessible"""
try:
async with httpx.AsyncClient(timeout=5) as client:
response = await client.get(f"{self.overpass_url}/status")
is_healthy = response.status_code == 200
return {
"healthy": is_healthy,
"status_code": response.status_code,
"url": self.overpass_url
}
except Exception as e:
return {
"healthy": False,
"error": str(e),
"url": self.overpass_url
}
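
Two small sketches for the service above: a worked example of the Tier 1 proximity score (each POI contributes 1 / (1 + distance_km), so closer POIs dominate), and a call to detect_pois_for_bakery. The module path, coordinates, and distances are illustrative assumptions; running the async part queries the configured Overpass endpoint and is subject to its rate limits.

# Sketches (module path, coordinates, and distances are illustrative assumptions)
import asyncio
from app.services.poi_detection_service import POIDetectionService

# Worked Tier 1 proximity-score example: each POI contributes 1 / (1 + distance_km)
distances_m = [50, 200, 800]
proximity_score = sum(1.0 / (1.0 + d / 1000) for d in distances_m)
print(round(proximity_score, 4))  # 0.9524 + 0.8333 + 0.5556 ~= 2.3413

async def main() -> None:
    # Queries the configured Overpass endpoint; subject to its rate limits
    service = POIDetectionService()
    result = await service.detect_pois_for_bakery(43.2630, -2.9349, tenant_id="tenant-demo")
    print(result["detection_status"], result["summary"]["total_pois_detected"])

asyncio.run(main())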

View File

@@ -0,0 +1,184 @@
"""
POI Feature Selector
Determines which POI features are relevant for ML model inclusion.
Filters out low-signal features to prevent model noise and overfitting.
"""
from typing import Any, Dict, List, Optional
import structlog
from app.core.poi_config import RELEVANCE_THRESHOLDS
logger = structlog.get_logger()
class POIFeatureSelector:
"""
Feature relevance engine for POI-based ML features.
Applies research-based thresholds to filter out irrelevant POI features
that would add noise to bakery-specific demand forecasting models.
"""
def __init__(self, thresholds: Optional[Dict[str, Dict[str, float]]] = None):
"""
Initialize feature selector.
Args:
thresholds: Custom relevance thresholds (defaults to RELEVANCE_THRESHOLDS)
"""
self.thresholds = thresholds or RELEVANCE_THRESHOLDS
def select_relevant_features(
self,
poi_detection_results: Dict[str, Any],
tenant_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Filter POI features based on relevance thresholds.
Only includes features for POI categories that pass relevance tests.
This prevents adding noise to ML models for bakeries where certain
POI categories are not significant.
Args:
poi_detection_results: Full POI detection results
tenant_id: Optional tenant ID for logging
Returns:
Dictionary with relevant features and detailed relevance report
"""
relevant_features = {}
relevance_report = []
relevant_categories = []
for category_key, data in poi_detection_results.items():
features = data.get("features", {})
thresholds = self.thresholds.get(category_key, {})
if not thresholds:
logger.warning(
f"No thresholds defined for category {category_key}",
tenant_id=tenant_id
)
continue
# Check relevance criteria
is_relevant, rejection_reason = self._check_relevance(
features, thresholds, category_key
)
if is_relevant:
# Include features with category prefix
for feature_name, value in features.items():
ml_feature_name = f"poi_{category_key}_{feature_name}"
# Convert boolean to int for ML
if isinstance(value, bool):
value = 1 if value else 0
relevant_features[ml_feature_name] = value
relevant_categories.append(category_key)
relevance_report.append({
"category": category_key,
"relevant": True,
"reason": "Passes all relevance thresholds",
"proximity_score": features.get("proximity_score", 0),
"count": features.get("total_count", 0),
"distance_to_nearest_m": features.get("distance_to_nearest_m", 9999)
})
else:
relevance_report.append({
"category": category_key,
"relevant": False,
"reason": rejection_reason,
"proximity_score": features.get("proximity_score", 0),
"count": features.get("total_count", 0),
"distance_to_nearest_m": features.get("distance_to_nearest_m", 9999)
})
logger.info(
"POI feature selection complete",
tenant_id=tenant_id,
total_categories=len(poi_detection_results),
relevant_categories=len(relevant_categories),
rejected_categories=len(poi_detection_results) - len(relevant_categories)
)
return {
"features": relevant_features,
"relevant_categories": relevant_categories,
"relevance_report": relevance_report,
"total_features": len(relevant_features),
"total_relevant_categories": len(relevant_categories)
}
def _check_relevance(
self,
features: Dict[str, Any],
thresholds: Dict[str, float],
category_key: str
) -> tuple[bool, str]:
"""
Check if POI category passes relevance thresholds.
Returns:
Tuple of (is_relevant, rejection_reason)
"""
# Criterion 1: Proximity score
min_proximity = thresholds.get("min_proximity_score", 0)
actual_proximity = features.get("proximity_score", 0)
if actual_proximity < min_proximity:
return False, f"Proximity score too low ({actual_proximity:.2f} < {min_proximity})"
# Criterion 2: Distance to nearest
max_distance = thresholds.get("max_distance_to_nearest_m", 9999)
actual_distance = features.get("distance_to_nearest_m", 9999)
if actual_distance > max_distance:
return False, f"Nearest POI too far ({actual_distance:.0f}m > {max_distance}m)"
# Criterion 3: Count threshold
min_count = thresholds.get("min_count", 0)
actual_count = features.get("total_count", 0)
if actual_count < min_count:
return False, f"Count too low ({actual_count} < {min_count})"
return True, "Passes all thresholds"
def get_feature_importance_summary(
self,
poi_detection_results: Dict[str, Any]
) -> List[Dict[str, Any]]:
"""
Generate summary of feature importance for all categories.
Useful for understanding POI landscape around a bakery.
"""
summary = []
for category_key, data in poi_detection_results.items():
features = data.get("features", {})
thresholds = self.thresholds.get(category_key, {})
is_relevant, reason = self._check_relevance(
features, thresholds, category_key
) if thresholds else (False, "No thresholds defined")
summary.append({
"category": category_key,
"is_relevant": is_relevant,
"proximity_score": features.get("proximity_score", 0),
"weighted_score": features.get("weighted_proximity_score", 0),
"total_count": features.get("total_count", 0),
"distance_to_nearest_m": features.get("distance_to_nearest_m", 9999),
"has_within_100m": features.get("has_within_100m", False),
"rejection_reason": None if is_relevant else reason
})
# Sort by relevance and proximity score
summary.sort(
key=lambda x: (x["is_relevant"], x["proximity_score"]),
reverse=True
)
return summary
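
A self-contained sketch of the selector above. The "schools" category, its feature values, and the threshold numbers are illustrative assumptions; in the application the defaults come from RELEVANCE_THRESHOLDS in app.core.poi_config, and the input mirrors the per-category structure produced by the detection service.

# Selection sketch (category name, feature values, and thresholds are assumptions)
from app.services.poi_feature_selector import POIFeatureSelector

poi_results = {
    "schools": {
        "features": {
            "proximity_score": 2.1,
            "weighted_proximity_score": 2.7,
            "count_0_100m": 0,
            "count_100_300m": 2,
            "count_300_500m": 1,
            "count_500_1000m": 0,
            "total_count": 3,
            "distance_to_nearest_m": 120.0,
            "has_within_100m": False,
            "has_within_300m": True,
            "has_within_500m": True,
        },
        "count": 3,
    },
}
selector = POIFeatureSelector(thresholds={
    "schools": {"min_proximity_score": 0.5, "max_distance_to_nearest_m": 500, "min_count": 1},
})
selection = selector.select_relevant_features(poi_results, tenant_id="tenant-demo")
print(selection["relevant_categories"])                       # ['schools']
print(selection["features"]["poi_schools_has_within_300m"])   # 1 (booleans become ints)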

View File

@@ -0,0 +1,468 @@
"""
POI Refresh Service
Manages periodic POI context refresh jobs.
Detects changes in POI landscape and updates tenant POI contexts.
"""
import asyncio
from datetime import datetime, timezone, timedelta
from typing import Optional, Dict, Any, List
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, or_
import structlog
from app.models.poi_refresh_job import POIRefreshJob
from app.models.poi_context import TenantPOIContext
from app.services.poi_detection_service import POIDetectionService
from app.core.database import database_manager
logger = structlog.get_logger()
class POIRefreshService:
"""
POI Refresh Service
Manages background jobs for periodic POI context refresh.
Default refresh cycle: 180 days (6 months).
"""
DEFAULT_REFRESH_INTERVAL_DAYS = 180
DEFAULT_MAX_ATTEMPTS = 3
STALE_THRESHOLD_DAYS = 180
def __init__(
self,
poi_detection_service: Optional[POIDetectionService] = None,
refresh_interval_days: int = DEFAULT_REFRESH_INTERVAL_DAYS
):
"""
Initialize POI refresh service.
Args:
poi_detection_service: POI detection service instance
refresh_interval_days: Days between POI refreshes (default: 180)
"""
self.poi_detection_service = poi_detection_service or POIDetectionService()
self.refresh_interval_days = refresh_interval_days
logger.info(
"POI Refresh Service initialized",
refresh_interval_days=refresh_interval_days
)
async def schedule_refresh_job(
self,
tenant_id: str,
latitude: float,
longitude: float,
scheduled_at: Optional[datetime] = None,
session: Optional[AsyncSession] = None
) -> POIRefreshJob:
"""
Schedule a POI refresh job for a tenant.
Args:
tenant_id: Tenant UUID
latitude: Bakery latitude
longitude: Bakery longitude
scheduled_at: When to run the job (default: now + refresh_interval)
session: Database session
Returns:
Created POIRefreshJob
"""
if scheduled_at is None:
scheduled_at = datetime.now(timezone.utc) + timedelta(
days=self.refresh_interval_days
)
async def _create_job(db_session: AsyncSession):
# Check if pending job already exists
result = await db_session.execute(
select(POIRefreshJob).where(
and_(
POIRefreshJob.tenant_id == tenant_id,
POIRefreshJob.status.in_(["pending", "running"])
)
)
)
existing_job = result.scalar_one_or_none()
if existing_job:
logger.info(
"POI refresh job already scheduled",
tenant_id=tenant_id,
job_id=str(existing_job.id),
scheduled_at=existing_job.scheduled_at
)
return existing_job
# Create new job
job = POIRefreshJob(
tenant_id=tenant_id,
latitude=latitude,
longitude=longitude,
scheduled_at=scheduled_at,
status="pending",
max_attempts=self.DEFAULT_MAX_ATTEMPTS
)
db_session.add(job)
await db_session.commit()
await db_session.refresh(job)
logger.info(
"POI refresh job scheduled",
tenant_id=tenant_id,
job_id=str(job.id),
scheduled_at=scheduled_at
)
return job
if session:
return await _create_job(session)
else:
async with database_manager.get_session() as db_session:
return await _create_job(db_session)
async def execute_refresh_job(
self,
job_id: str,
session: Optional[AsyncSession] = None
) -> Dict[str, Any]:
"""
Execute a POI refresh job.
Args:
job_id: Job UUID
session: Database session
Returns:
Execution result with status and details
"""
async def _execute(db_session: AsyncSession):
# Load job
result = await db_session.execute(
select(POIRefreshJob).where(POIRefreshJob.id == job_id)
)
job = result.scalar_one_or_none()
if not job:
raise ValueError(f"Job not found: {job_id}")
if job.status == "running":
return {
"status": "already_running",
"job_id": str(job.id),
"message": "Job is already running"
}
if job.status == "completed":
return {
"status": "already_completed",
"job_id": str(job.id),
"message": "Job already completed"
}
if not job.can_retry:
return {
"status": "max_attempts_reached",
"job_id": str(job.id),
"message": f"Max attempts ({job.max_attempts}) reached"
}
# Update job status
job.status = "running"
job.started_at = datetime.now(timezone.utc)
job.attempt_count += 1
await db_session.commit()
logger.info(
"Executing POI refresh job",
job_id=str(job.id),
tenant_id=str(job.tenant_id),
attempt=job.attempt_count
)
try:
# Get existing POI context
poi_result = await db_session.execute(
select(TenantPOIContext).where(
TenantPOIContext.tenant_id == job.tenant_id
)
)
existing_context = poi_result.scalar_one_or_none()
# Perform POI detection
detection_result = await self.poi_detection_service.detect_pois_for_bakery(
latitude=job.latitude,
longitude=job.longitude,
tenant_id=str(job.tenant_id)
)
# Analyze changes against the stored per-category results
# (detect_pois_for_bakery nests them under "poi_categories")
old_results = existing_context.poi_detection_results if existing_context else {}
old_results = (old_results or {}).get("poi_categories", old_results or {})
changes = self._analyze_changes(
old_results,
detection_result.get("poi_categories", {})
)
# Update job with results
job.status = "completed"
job.completed_at = datetime.now(timezone.utc)
job.pois_detected = sum(
data.get("count", 0)
for data in detection_result.get("poi_categories", {}).values()
)
job.changes_detected = changes["has_significant_changes"]
job.change_summary = changes
# Schedule next refresh
job.next_scheduled_at = datetime.now(timezone.utc) + timedelta(
days=self.refresh_interval_days
)
await db_session.commit()
logger.info(
"POI refresh job completed",
job_id=str(job.id),
tenant_id=str(job.tenant_id),
pois_detected=job.pois_detected,
changes_detected=job.changes_detected,
duration_seconds=job.duration_seconds
)
# Schedule next job
await self.schedule_refresh_job(
tenant_id=str(job.tenant_id),
latitude=job.latitude,
longitude=job.longitude,
scheduled_at=job.next_scheduled_at,
session=db_session
)
return {
"status": "success",
"job_id": str(job.id),
"pois_detected": job.pois_detected,
"changes_detected": job.changes_detected,
"change_summary": changes,
"duration_seconds": job.duration_seconds,
"next_scheduled_at": job.next_scheduled_at.isoformat()
}
except Exception as e:
# Job failed
job.status = "failed"
job.completed_at = datetime.now(timezone.utc)
job.error_message = str(e)
job.error_details = {
"error_type": type(e).__name__,
"error_message": str(e),
"attempt": job.attempt_count
}
# Schedule retry if attempts remaining
if job.can_retry:
job.next_scheduled_at = datetime.now(timezone.utc) + timedelta(hours=1)
logger.warning(
"POI refresh job failed, will retry",
job_id=str(job.id),
tenant_id=str(job.tenant_id),
attempt=job.attempt_count,
max_attempts=job.max_attempts,
error=str(e)
)
else:
logger.error(
"POI refresh job failed permanently",
job_id=str(job.id),
tenant_id=str(job.tenant_id),
attempt=job.attempt_count,
error=str(e),
exc_info=True
)
await db_session.commit()
return {
"status": "failed",
"job_id": str(job.id),
"error": str(e),
"attempt": job.attempt_count,
"can_retry": job.can_retry
}
if session:
return await _execute(session)
else:
async with database_manager.get_session() as db_session:
return await _execute(db_session)
def _analyze_changes(
self,
old_results: Dict[str, Any],
new_results: Dict[str, Any]
) -> Dict[str, Any]:
"""
Analyze changes between old and new POI detection results.
Args:
old_results: Previous POI detection results
new_results: New POI detection results
Returns:
Change analysis with significance flag
"""
changes = {
"has_significant_changes": False,
"category_changes": {},
"total_poi_change": 0,
"new_categories": [],
"removed_categories": []
}
old_categories = set(old_results.keys())
new_categories = set(new_results.keys())
# New categories
changes["new_categories"] = list(new_categories - old_categories)
# Removed categories
changes["removed_categories"] = list(old_categories - new_categories)
# Analyze changes per category
for category in new_categories:
old_count = old_results.get(category, {}).get("count", 0)
new_count = new_results.get(category, {}).get("count", 0)
change = new_count - old_count
if abs(change) > 0:
changes["category_changes"][category] = {
"old_count": old_count,
"new_count": new_count,
"change": change,
"change_percent": (change / old_count * 100) if old_count > 0 else 100
}
changes["total_poi_change"] += abs(change)
# Determine if changes are significant
# Significant if: 10+ POIs changed OR 20%+ change OR new/removed categories
total_old_pois = sum(data.get("count", 0) for data in old_results.values())
if total_old_pois > 0:
change_percent = (changes["total_poi_change"] / total_old_pois) * 100
changes["total_change_percent"] = change_percent
changes["has_significant_changes"] = (
changes["total_poi_change"] >= 10
or change_percent >= 20
or len(changes["new_categories"]) > 0
or len(changes["removed_categories"]) > 0
)
else:
changes["has_significant_changes"] = changes["total_poi_change"] > 0
return changes
async def get_pending_jobs(
self,
limit: int = 100,
session: Optional[AsyncSession] = None
) -> List[POIRefreshJob]:
"""
Get pending jobs that are due for execution.
Args:
limit: Maximum number of jobs to return
session: Database session
Returns:
List of pending jobs
"""
async def _get_jobs(db_session: AsyncSession):
result = await db_session.execute(
select(POIRefreshJob)
.where(
and_(
POIRefreshJob.status == "pending",
POIRefreshJob.scheduled_at <= datetime.now(timezone.utc)
)
)
.order_by(POIRefreshJob.scheduled_at)
.limit(limit)
)
return result.scalars().all()
if session:
return await _get_jobs(session)
else:
async with database_manager.get_session() as db_session:
return await _get_jobs(db_session)
async def process_pending_jobs(
self,
max_concurrent: int = 5,
session: Optional[AsyncSession] = None
) -> Dict[str, Any]:
"""
Process all pending jobs concurrently.
Args:
max_concurrent: Maximum concurrent job executions
session: Database session
Returns:
Processing summary
"""
pending_jobs = await self.get_pending_jobs(session=session)
if not pending_jobs:
logger.info("No pending POI refresh jobs")
return {
"total_jobs": 0,
"successful": 0,
"failed": 0,
"results": []
}
logger.info(
"Processing pending POI refresh jobs",
count=len(pending_jobs),
max_concurrent=max_concurrent
)
# Process jobs with concurrency limit
semaphore = asyncio.Semaphore(max_concurrent)
async def process_job(job: POIRefreshJob):
async with semaphore:
return await self.execute_refresh_job(str(job.id))
results = await asyncio.gather(
*[process_job(job) for job in pending_jobs],
return_exceptions=True
)
# Summarize results
successful = sum(1 for r in results if isinstance(r, dict) and r.get("status") == "success")
failed = sum(1 for r in results if isinstance(r, dict) and r.get("status") == "failed")
errors = sum(1 for r in results if isinstance(r, Exception))
summary = {
"total_jobs": len(pending_jobs),
"successful": successful,
"failed": failed + errors,
"results": [r if not isinstance(r, Exception) else {"status": "error", "error": str(r)} for r in results]
}
logger.info(
"POI refresh jobs processing completed",
**summary
)
return summary
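
A usage sketch for the refresh service above; the module path, tenant UUID, and coordinates are illustrative assumptions, and it requires the application's database (database_manager) to be configured.

# Refresh-service sketch (identifiers below are assumptions; needs a configured database)
import asyncio
from app.services.poi_refresh_service import POIRefreshService

async def main() -> None:
    service = POIRefreshService(refresh_interval_days=180)
    job = await service.schedule_refresh_job(
        tenant_id="00000000-0000-0000-0000-000000000001",
        latitude=43.2630,
        longitude=-2.9349,
    )
    print(job.status, job.scheduled_at)
    summary = await service.process_pending_jobs(max_concurrent=2)
    print(summary["total_jobs"], summary["successful"], summary["failed"])

asyncio.run(main())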

View File

@@ -0,0 +1,187 @@
"""
POI Refresh Scheduler
Background scheduler for periodic POI context refresh.
Runs every hour to check for and execute pending POI refresh jobs.
"""
import asyncio
from typing import Optional
from datetime import datetime, timezone
import structlog
from app.services.poi_refresh_service import POIRefreshService
logger = structlog.get_logger()
class POIRefreshScheduler:
"""
POI Refresh Scheduler
Background task that periodically checks for and executes
pending POI refresh jobs.
"""
def __init__(
self,
poi_refresh_service: Optional[POIRefreshService] = None,
check_interval_seconds: int = 3600, # 1 hour
max_concurrent_jobs: int = 5
):
"""
Initialize POI refresh scheduler.
Args:
poi_refresh_service: POI refresh service instance
check_interval_seconds: Seconds between checks (default: 3600 = 1 hour)
max_concurrent_jobs: Max concurrent job executions (default: 5)
"""
self.poi_refresh_service = poi_refresh_service or POIRefreshService()
self.check_interval_seconds = check_interval_seconds
self.max_concurrent_jobs = max_concurrent_jobs
self._task: Optional[asyncio.Task] = None
self._running = False
logger.info(
"POI Refresh Scheduler initialized",
check_interval_seconds=check_interval_seconds,
max_concurrent_jobs=max_concurrent_jobs
)
async def start(self):
"""Start the scheduler background task"""
if self._running:
logger.warning("POI Refresh Scheduler already running")
return
self._running = True
self._task = asyncio.create_task(self._run_scheduler())
logger.info("POI Refresh Scheduler started")
async def stop(self):
"""Stop the scheduler background task"""
if not self._running:
return
self._running = False
if self._task:
self._task.cancel()
try:
await self._task
except asyncio.CancelledError:
pass
logger.info("POI Refresh Scheduler stopped")
async def _run_scheduler(self):
"""Main scheduler loop"""
logger.info("POI Refresh Scheduler loop started")
while self._running:
try:
await self._process_cycle()
except Exception as e:
logger.error(
"POI refresh scheduler cycle failed",
error=str(e),
exc_info=True
)
# Wait for next cycle
try:
await asyncio.sleep(self.check_interval_seconds)
except asyncio.CancelledError:
break
logger.info("POI Refresh Scheduler loop ended")
async def _process_cycle(self):
"""Process one scheduler cycle"""
cycle_start = datetime.now(timezone.utc)
logger.debug(
"POI refresh scheduler cycle started",
timestamp=cycle_start.isoformat()
)
# Process pending jobs
result = await self.poi_refresh_service.process_pending_jobs(
max_concurrent=self.max_concurrent_jobs
)
cycle_end = datetime.now(timezone.utc)
cycle_duration = (cycle_end - cycle_start).total_seconds()
if result["total_jobs"] > 0:
logger.info(
"POI refresh scheduler cycle completed",
total_jobs=result["total_jobs"],
successful=result["successful"],
failed=result["failed"],
cycle_duration_seconds=cycle_duration
)
else:
logger.debug(
"POI refresh scheduler cycle completed (no jobs)",
cycle_duration_seconds=cycle_duration
)
async def trigger_immediate_check(self):
"""Trigger an immediate check for pending jobs (bypasses schedule)"""
logger.info("POI refresh scheduler immediate check triggered")
try:
result = await self.poi_refresh_service.process_pending_jobs(
max_concurrent=self.max_concurrent_jobs
)
logger.info(
"POI refresh scheduler immediate check completed",
total_jobs=result["total_jobs"],
successful=result["successful"],
failed=result["failed"]
)
return result
except Exception as e:
logger.error(
"POI refresh scheduler immediate check failed",
error=str(e),
exc_info=True
)
raise
@property
def is_running(self) -> bool:
"""Check if scheduler is running"""
return self._running
# Global scheduler instance
_scheduler_instance: Optional[POIRefreshScheduler] = None
def get_scheduler() -> POIRefreshScheduler:
"""Get global scheduler instance (singleton)"""
global _scheduler_instance
if _scheduler_instance is None:
_scheduler_instance = POIRefreshScheduler()
return _scheduler_instance
async def start_scheduler():
"""Start global POI refresh scheduler"""
scheduler = get_scheduler()
await scheduler.start()
async def stop_scheduler():
"""Stop global POI refresh scheduler"""
scheduler = get_scheduler()
await scheduler.stop()
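
A lifecycle sketch for wiring the scheduler into application startup and shutdown. The FastAPI lifespan hook and the scheduler module path are assumptions; only start_scheduler and stop_scheduler are defined in the file above.

# Startup/shutdown sketch (FastAPI wiring and module path are assumptions)
from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.services.poi_refresh_scheduler import start_scheduler, stop_scheduler

@asynccontextmanager
async def lifespan(app: FastAPI):
    await start_scheduler()   # begins hourly checks for pending refresh jobs
    try:
        yield
    finally:
        await stop_scheduler()

app = FastAPI(lifespan=lifespan)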