# bakery-ia/services/forecasting/app/ml/pattern_detector.py
"""
Pattern Detection Engine for Sales Data
Automatically identifies patterns and generates insights
"""
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import structlog
from scipy import stats
logger = structlog.get_logger()
class SalesPatternDetector:
"""
Detect sales patterns and generate actionable insights.
Patterns detected:
- Time-of-day patterns (hourly peaks)
- Day-of-week patterns (weekend spikes)
- Weekly seasonality patterns
- Monthly patterns
- Holiday impact patterns
- Weather correlation patterns
"""
def __init__(self, significance_threshold: float = 0.15):
"""
Initialize pattern detector.
Args:
significance_threshold: Minimum percentage difference to consider significant (default 15%)
"""
self.significance_threshold = significance_threshold
self.detected_patterns = []
async def detect_all_patterns(
    self,
    tenant_id: str,
    inventory_product_id: str,
    sales_data: pd.DataFrame,
    min_confidence: int = 70
) -> List[Dict[str, Any]]:
    """
    Detect all patterns in sales data and generate insights.

    Args:
        tenant_id: Tenant identifier
        inventory_product_id: Product identifier
        sales_data: Sales data with columns: date, quantity,
            (optional: hour, temperature, etc.)
        min_confidence: Minimum confidence score (0-100) an insight must
            reach to be reported

    Returns:
        List of insight dictionaries ready for AI Insights Service
    """
    logger.info(
        "Starting pattern detection",
        tenant_id=tenant_id,
        product_id=inventory_product_id,
        data_points=len(sales_data)
    )

    # BUGFIX: operate on a copy -- the individual detectors add helper
    # columns (day_of_week, is_weekend, ...) and previously mutated the
    # caller's DataFrame as a side effect.
    sales_data = sales_data.copy()

    # Ensure date column is datetime so .dt accessors work downstream.
    if 'date' in sales_data.columns:
        sales_data['date'] = pd.to_datetime(sales_data['date'])

    insights: List[Dict[str, Any]] = []

    # 1. Day-of-week patterns
    insights.extend(await self._detect_day_of_week_patterns(
        tenant_id, inventory_product_id, sales_data, min_confidence
    ))

    # 2. Weekend vs weekday patterns
    insights.extend(await self._detect_weekend_patterns(
        tenant_id, inventory_product_id, sales_data, min_confidence
    ))

    # 3. Month-end / payday patterns
    insights.extend(await self._detect_month_end_patterns(
        tenant_id, inventory_product_id, sales_data, min_confidence
    ))

    # 4. Hourly patterns (only if POS-level hour data is available)
    if 'hour' in sales_data.columns:
        insights.extend(await self._detect_hourly_patterns(
            tenant_id, inventory_product_id, sales_data, min_confidence
        ))

    # 5. Weather correlation (only if temperature data is available)
    if 'temperature' in sales_data.columns:
        insights.extend(await self._detect_weather_correlations(
            tenant_id, inventory_product_id, sales_data, min_confidence
        ))

    # 6. Long-run trend detection
    insights.extend(await self._detect_trends(
        tenant_id, inventory_product_id, sales_data, min_confidence
    ))

    logger.info(
        "Pattern detection complete",
        total_insights=len(insights),
        product_id=inventory_product_id
    )
    return insights
async def _detect_day_of_week_patterns(
self,
tenant_id: str,
inventory_product_id: str,
sales_data: pd.DataFrame,
min_confidence: int
) -> List[Dict[str, Any]]:
"""Detect day-of-week patterns (e.g., Friday sales spike)."""
insights = []
if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
return insights
# Add day of week
sales_data['day_of_week'] = sales_data['date'].dt.dayofweek
sales_data['day_name'] = sales_data['date'].dt.day_name()
# Calculate average sales per day of week
dow_avg = sales_data.groupby(['day_of_week', 'day_name'])['quantity'].agg(['mean', 'count']).reset_index()
# Only consider days with sufficient data (at least 4 observations)
dow_avg = dow_avg[dow_avg['count'] >= 4]
if len(dow_avg) < 2:
return insights
overall_avg = sales_data['quantity'].mean()
# Find days significantly above average
for _, row in dow_avg.iterrows():
day_avg = row['mean']
pct_diff = ((day_avg - overall_avg) / overall_avg) * 100
if abs(pct_diff) > self.significance_threshold * 100:
# Calculate confidence based on sample size and consistency
confidence = self._calculate_pattern_confidence(
sample_size=int(row['count']),
effect_size=abs(pct_diff) / 100,
variability=sales_data['quantity'].std()
)
if confidence >= min_confidence:
if pct_diff > 0:
insight = self._create_insight(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
insight_type='pattern',
category='sales',
priority='medium' if pct_diff > 20 else 'low',
title=f'{row["day_name"]} Sales Pattern Detected',
description=f'Sales on {row["day_name"]} are {abs(pct_diff):.1f}% {"higher" if pct_diff > 0 else "lower"} than average ({day_avg:.1f} vs {overall_avg:.1f} units).',
confidence=confidence,
metrics={
'day_of_week': row['day_name'],
'avg_sales': float(day_avg),
'overall_avg': float(overall_avg),
'difference_pct': float(pct_diff),
'sample_size': int(row['count'])
},
actionable=True,
actions=[
{'label': 'Adjust Production', 'action': 'adjust_daily_production'},
{'label': 'Review Schedule', 'action': 'review_production_schedule'}
]
)
insights.append(insight)
return insights
async def _detect_weekend_patterns(
self,
tenant_id: str,
inventory_product_id: str,
sales_data: pd.DataFrame,
min_confidence: int
) -> List[Dict[str, Any]]:
"""Detect weekend vs weekday patterns."""
insights = []
if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
return insights
# Classify weekend vs weekday
sales_data['is_weekend'] = sales_data['date'].dt.dayofweek.isin([5, 6])
# Calculate averages
weekend_avg = sales_data[sales_data['is_weekend']]['quantity'].mean()
weekday_avg = sales_data[~sales_data['is_weekend']]['quantity'].mean()
weekend_count = sales_data[sales_data['is_weekend']]['quantity'].count()
weekday_count = sales_data[~sales_data['is_weekend']]['quantity'].count()
if weekend_count < 4 or weekday_count < 4:
return insights
pct_diff = ((weekend_avg - weekday_avg) / weekday_avg) * 100
if abs(pct_diff) > self.significance_threshold * 100:
confidence = self._calculate_pattern_confidence(
sample_size=min(weekend_count, weekday_count),
effect_size=abs(pct_diff) / 100,
variability=sales_data['quantity'].std()
)
if confidence >= min_confidence:
# Estimate revenue impact
impact_value = abs(weekend_avg - weekday_avg) * 8 * 4 # 8 weekend days per month
insight = self._create_insight(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
insight_type='recommendation',
category='forecasting',
priority='high' if abs(pct_diff) > 25 else 'medium',
title=f'Weekend Demand Pattern: {abs(pct_diff):.0f}% {"Higher" if pct_diff > 0 else "Lower"}',
description=f'Weekend sales average {weekend_avg:.1f} units vs {weekday_avg:.1f} on weekdays ({abs(pct_diff):.0f}% {"increase" if pct_diff > 0 else "decrease"}). Recommend adjusting weekend production targets.',
confidence=confidence,
impact_type='revenue_increase' if pct_diff > 0 else 'cost_savings',
impact_value=float(impact_value),
impact_unit='units/month',
metrics={
'weekend_avg': float(weekend_avg),
'weekday_avg': float(weekday_avg),
'difference_pct': float(pct_diff),
'weekend_samples': int(weekend_count),
'weekday_samples': int(weekday_count)
},
actionable=True,
actions=[
{'label': 'Increase Weekend Production', 'action': 'adjust_weekend_production'},
{'label': 'Update Forecast Multiplier', 'action': 'update_forecast_rule'}
]
)
insights.append(insight)
return insights
async def _detect_month_end_patterns(
self,
tenant_id: str,
inventory_product_id: str,
sales_data: pd.DataFrame,
min_confidence: int
) -> List[Dict[str, Any]]:
"""Detect month-end and payday patterns."""
insights = []
if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
return insights
# Identify payday periods (15th and last 3 days of month)
sales_data['day_of_month'] = sales_data['date'].dt.day
sales_data['is_payday'] = (
(sales_data['day_of_month'] == 15) |
(sales_data['date'].dt.is_month_end) |
(sales_data['day_of_month'] >= sales_data['date'].dt.days_in_month - 2)
)
payday_avg = sales_data[sales_data['is_payday']]['quantity'].mean()
regular_avg = sales_data[~sales_data['is_payday']]['quantity'].mean()
payday_count = sales_data[sales_data['is_payday']]['quantity'].count()
if payday_count < 4:
return insights
pct_diff = ((payday_avg - regular_avg) / regular_avg) * 100
if abs(pct_diff) > self.significance_threshold * 100:
confidence = self._calculate_pattern_confidence(
sample_size=payday_count,
effect_size=abs(pct_diff) / 100,
variability=sales_data['quantity'].std()
)
if confidence >= min_confidence and pct_diff > 0:
insight = self._create_insight(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
insight_type='pattern',
category='sales',
priority='medium',
title=f'Payday Shopping Pattern Detected',
description=f'Sales increase {pct_diff:.0f}% during payday periods (15th and month-end). Average {payday_avg:.1f} vs {regular_avg:.1f} units.',
confidence=confidence,
metrics={
'payday_avg': float(payday_avg),
'regular_avg': float(regular_avg),
'difference_pct': float(pct_diff)
},
actionable=True,
actions=[
{'label': 'Increase Payday Stock', 'action': 'adjust_payday_production'}
]
)
insights.append(insight)
return insights
async def _detect_hourly_patterns(
self,
tenant_id: str,
inventory_product_id: str,
sales_data: pd.DataFrame,
min_confidence: int
) -> List[Dict[str, Any]]:
"""Detect hourly sales patterns (if POS data available)."""
insights = []
if 'hour' not in sales_data.columns or 'quantity' not in sales_data.columns:
return insights
hourly_avg = sales_data.groupby('hour')['quantity'].agg(['mean', 'count']).reset_index()
hourly_avg = hourly_avg[hourly_avg['count'] >= 3] # At least 3 observations
if len(hourly_avg) < 3:
return insights
overall_avg = sales_data['quantity'].mean()
# Find peak hours (top 3)
top_hours = hourly_avg.nlargest(3, 'mean')
for _, row in top_hours.iterrows():
hour_avg = row['mean']
pct_diff = ((hour_avg - overall_avg) / overall_avg) * 100
if pct_diff > self.significance_threshold * 100:
confidence = self._calculate_pattern_confidence(
sample_size=int(row['count']),
effect_size=pct_diff / 100,
variability=sales_data['quantity'].std()
)
if confidence >= min_confidence:
hour = int(row['hour'])
time_label = f"{hour:02d}:00-{(hour+1):02d}:00"
insight = self._create_insight(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
insight_type='pattern',
category='sales',
priority='low',
title=f'Peak Sales Hour: {time_label}',
description=f'Sales peak during {time_label} with {hour_avg:.1f} units ({pct_diff:.0f}% above average).',
confidence=confidence,
metrics={
'peak_hour': hour,
'avg_sales': float(hour_avg),
'overall_avg': float(overall_avg),
'difference_pct': float(pct_diff)
},
actionable=True,
actions=[
{'label': 'Ensure Fresh Stock', 'action': 'schedule_production'},
{'label': 'Increase Staffing', 'action': 'adjust_staffing'}
]
)
insights.append(insight)
return insights
async def _detect_weather_correlations(
self,
tenant_id: str,
inventory_product_id: str,
sales_data: pd.DataFrame,
min_confidence: int
) -> List[Dict[str, Any]]:
"""Detect weather-sales correlations."""
insights = []
if 'temperature' not in sales_data.columns or 'quantity' not in sales_data.columns:
return insights
# Remove NaN values
clean_data = sales_data[['temperature', 'quantity']].dropna()
if len(clean_data) < 30: # Need sufficient data
return insights
# Calculate correlation
correlation, p_value = stats.pearsonr(clean_data['temperature'], clean_data['quantity'])
if abs(correlation) > 0.3 and p_value < 0.05: # Moderate correlation and significant
confidence = self._calculate_correlation_confidence(correlation, p_value, len(clean_data))
if confidence >= min_confidence:
direction = 'increase' if correlation > 0 else 'decrease'
insight = self._create_insight(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
insight_type='insight',
category='forecasting',
priority='medium' if abs(correlation) > 0.5 else 'low',
title=f'Temperature Impact on Sales: {abs(correlation):.0%} Correlation',
description=f'Sales {direction} with temperature (correlation: {correlation:.2f}). {"Warmer" if correlation > 0 else "Colder"} weather associated with {"higher" if correlation > 0 else "lower"} sales.',
confidence=confidence,
metrics={
'correlation': float(correlation),
'p_value': float(p_value),
'sample_size': len(clean_data),
'direction': direction
},
actionable=False
)
insights.append(insight)
return insights
async def _detect_trends(
self,
tenant_id: str,
inventory_product_id: str,
sales_data: pd.DataFrame,
min_confidence: int
) -> List[Dict[str, Any]]:
"""Detect overall trends (growing, declining, stable)."""
insights = []
if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns or len(sales_data) < 60:
return insights
# Sort by date
sales_data = sales_data.sort_values('date')
# Calculate 30-day rolling average
sales_data['rolling_30d'] = sales_data['quantity'].rolling(window=30, min_periods=15).mean()
# Compare first and last 30-day averages
first_30_avg = sales_data['rolling_30d'].iloc[:30].mean()
last_30_avg = sales_data['rolling_30d'].iloc[-30:].mean()
if pd.isna(first_30_avg) or pd.isna(last_30_avg):
return insights
pct_change = ((last_30_avg - first_30_avg) / first_30_avg) * 100
if abs(pct_change) > 10: # 10% change is significant
confidence = min(95, 70 + int(abs(pct_change))) # Higher change = higher confidence
trend_type = 'growing' if pct_change > 0 else 'declining'
insight = self._create_insight(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
insight_type='prediction',
category='forecasting',
priority='high' if abs(pct_change) > 20 else 'medium',
title=f'Sales Trend: {trend_type.title()} {abs(pct_change):.0f}%',
description=f'Sales show a {trend_type} trend over the period. Current 30-day average: {last_30_avg:.1f} vs earlier: {first_30_avg:.1f} ({pct_change:+.0f}%).',
confidence=confidence,
metrics={
'current_avg': float(last_30_avg),
'previous_avg': float(first_30_avg),
'change_pct': float(pct_change),
'trend': trend_type
},
actionable=True,
actions=[
{'label': 'Adjust Forecast Model', 'action': 'update_forecast'},
{'label': 'Review Capacity', 'action': 'review_production_capacity'}
]
)
insights.append(insight)
return insights
def _calculate_pattern_confidence(
self,
sample_size: int,
effect_size: float,
variability: float
) -> int:
"""
Calculate confidence score for detected pattern.
Args:
sample_size: Number of observations
effect_size: Size of the effect (e.g., 0.25 for 25% difference)
variability: Standard deviation of data
Returns:
Confidence score (0-100)
"""
# Base confidence from sample size
if sample_size < 4:
base = 50
elif sample_size < 10:
base = 65
elif sample_size < 30:
base = 75
elif sample_size < 100:
base = 85
else:
base = 90
# Adjust for effect size
effect_boost = min(15, effect_size * 30)
# Adjust for variability (penalize high variability)
variability_penalty = min(10, variability / 10)
confidence = base + effect_boost - variability_penalty
return int(max(0, min(100, confidence)))
def _calculate_correlation_confidence(
self,
correlation: float,
p_value: float,
sample_size: int
) -> int:
"""Calculate confidence for correlation insights."""
# Base confidence from correlation strength
base = abs(correlation) * 100
# Boost for significance
if p_value < 0.001:
significance_boost = 15
elif p_value < 0.01:
significance_boost = 10
elif p_value < 0.05:
significance_boost = 5
else:
significance_boost = 0
# Boost for sample size
if sample_size > 100:
sample_boost = 10
elif sample_size > 50:
sample_boost = 5
else:
sample_boost = 0
confidence = base + significance_boost + sample_boost
return int(max(0, min(100, confidence)))
def _create_insight(
self,
tenant_id: str,
inventory_product_id: str,
insight_type: str,
category: str,
priority: str,
title: str,
description: str,
confidence: int,
metrics: Dict[str, Any],
actionable: bool,
actions: List[Dict[str, str]] = None,
impact_type: str = None,
impact_value: float = None,
impact_unit: str = None
) -> Dict[str, Any]:
"""Create an insight dictionary for AI Insights Service."""
return {
'tenant_id': tenant_id,
'type': insight_type,
'priority': priority,
'category': category,
'title': title,
'description': description,
'impact_type': impact_type,
'impact_value': impact_value,
'impact_unit': impact_unit,
'confidence': confidence,
'metrics_json': metrics,
'actionable': actionable,
'recommendation_actions': actions or [],
'source_service': 'forecasting',
'source_data_id': f'pattern_detection_{inventory_product_id}_{datetime.utcnow().strftime("%Y%m%d")}'
}