234 lines
7.5 KiB
Python
234 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate realistic one-year bakery sales data for AI model training
|
|
Creates daily sales data with proper patterns, seasonality, and realistic variations
|
|
Pure Python - no external dependencies
|
|
"""
|
|
|
|
import csv
|
|
import random
|
|
from datetime import datetime, timedelta
|
|
from math import sqrt
|
|
|
|
# Seed the shared random generator with a fixed value so that repeated runs
# draw the same sequence of numbers and therefore emit the same dataset.
random.seed(42)

# Product catalogue. Each row below is
#   (name, base daily quantity, unit price, weekend multiplier, holiday multiplier)
# and is expanded into the per-product settings dict used by the generator.
PRODUCTS = {
    name: {
        'base_qty': base_qty,
        'price': price,
        'weekend_factor': weekend_factor,
        'holiday_factor': holiday_factor,
    }
    for name, base_qty, price, weekend_factor, holiday_factor in (
        ('pan', 200, 1.20, 0.85, 1.30),
        ('croissant', 110, 1.50, 1.20, 1.25),
        ('napolitana', 75, 1.80, 1.15, 1.20),
        ('palmera', 50, 1.60, 1.25, 1.15),
        ('cafe', 280, 1.40, 0.75, 0.90),
    )
}
|
|
|
|
# Spanish holidays, as 'YYYY-MM-DD' strings, matched against each generated day.
# The default generated range starts on 2024-09-01, so the fixed-date holidays
# of late 2024 are listed as well as the 2025 ones; otherwise the first autumn
# and Christmas season in the dataset would never receive the holiday factor.
HOLIDAYS = [
    '2024-10-12',  # National Day
    '2024-11-01',  # All Saints' Day
    '2024-12-06',  # Constitution Day
    '2024-12-08',  # Immaculate Conception
    '2024-12-25',  # Christmas Day
    '2025-01-01',  # New Year's Day
    '2025-01-06',  # Epiphany (Three Kings' Day)
    '2025-04-18',  # Good Friday
    '2025-05-01',  # Labour Day
    '2025-08-15',  # Assumption of Mary
    '2025-10-12',  # National Day
    '2025-11-01',  # All Saints' Day
    '2025-12-06',  # Constitution Day
    '2025-12-08',  # Immaculate Conception
    '2025-12-25',  # Christmas Day
]
|
|
|
|
def random_normal(mean=0, std=1):
    """Draw one sample from a normal distribution using the Box-Muller transform.

    Args:
        mean: Centre of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        A float distributed as N(mean, std**2).
    """
    from math import cos, log, pi

    # random.random() lies in [0.0, 1.0); mirroring it to (0.0, 1.0] keeps
    # log() away from zero, which would otherwise raise a math domain error.
    u1 = 1.0 - random.random()
    u2 = random.random()

    # Box-Muller: radius from the first uniform, angle from the second.
    z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * pi * u2)
    return mean + z0 * std
|
|
|
|
def get_temperature(date):
    """Return a simulated daily temperature for Madrid for the month of *date*.

    A per-month baseline is perturbed by a uniform draw in [-4, 4], floored
    at 0 and rounded to one decimal place.

    Args:
        date: Object exposing a ``month`` attribute (1-12).

    Returns:
        The simulated temperature (presumably degrees Celsius - the source
        does not state the unit).
    """
    # Baselines for January through December, keyed by month number.
    monthly_baseline = dict(
        zip(range(1, 13), (8, 10, 13, 16, 20, 26, 30, 30, 25, 18, 12, 9))
    )
    reading = monthly_baseline[date.month] + random.uniform(-4, 4)
    return round(max(0, reading), 1)
|
|
|
|
def get_precipitation(date, temperature):
    """Return simulated rainfall in millimetres for the month of *date*.

    One uniform draw decides whether it rains, using a per-month probability
    that is highest in the cooler months; on a rainy day a second draw picks
    the amount.

    Args:
        date: Object exposing a ``month`` attribute (1-12).
        temperature: Accepted for interface compatibility; not used here.

    Returns:
        ``0`` on a dry day, otherwise a value in [2, 25] rounded to one decimal.
    """
    month = date.month

    # Probability of a rainy day for January through December.
    wet_day_odds = dict(
        zip(
            range(1, 13),
            (0.25, 0.25, 0.20, 0.25, 0.20, 0.10, 0.05, 0.05, 0.15, 0.20, 0.25, 0.25),
        )
    )

    # Guard clause: most days are dry.
    if random.random() >= wet_day_odds[month]:
        return 0

    # Rainy day: pick the amount in mm.
    return round(random.uniform(2, 25), 1)
|
|
|
|
def calculate_quantity(product_name, product_info, date, is_weekend, is_holiday, temperature, precipitation):
    """Compute a simulated number of units sold for one product on one day.

    The product's base quantity is scaled by weekend, holiday, seasonal,
    temperature, precipitation and day-of-week multipliers, then by a random
    +/-15% variation. The result is clamped to a per-product minimum, after
    which there is a 5% chance of an unusually slow day.

    Args:
        product_name: Product key; must be one of 'pan', 'croissant',
            'napolitana', 'palmera' or 'cafe' (used for the minimum lookup).
        product_info: Mapping with 'base_qty', 'weekend_factor' and
            'holiday_factor'.
        date: Object exposing ``month`` and ``weekday()``.
        is_weekend: Whether to apply the weekend multiplier.
        is_holiday: Whether to apply the holiday multiplier.
        temperature: Temperature of the day, compared against fixed thresholds.
        precipitation: Rainfall of the day in mm.

    Returns:
        The simulated quantity sold, as an int.

    Raises:
        KeyError: If ``product_name`` has no configured minimum quantity.
    """
    base = product_info['base_qty']

    # Weekend and holiday adjustments stack when both apply.
    if is_weekend:
        base *= product_info['weekend_factor']
    if is_holiday:
        base *= product_info['holiday_factor']

    # Seasonal adjustment.
    month = date.month
    if month in (12, 1):  # Christmas / New Year boost
        base *= 1.15
    elif month in (7, 8):  # Summer vacation dip
        base *= 0.90
    elif month in (4, 5, 9, 10):  # Spring / autumn, moderate boost
        base *= 1.05

    # Temperature effect.
    if product_name == 'cafe':
        # More coffee when cold, less when hot.
        if temperature < 12:
            base *= 1.15
        elif temperature > 28:
            base *= 0.85
    else:
        # Baked goods sell better in moderate weather.
        if 15 <= temperature <= 25:
            base *= 1.05
        elif temperature > 30:
            base *= 0.90

    # Precipitation effect. The heavier threshold must be tested first:
    # checking "> 5" first would also catch every value above 15 and leave
    # the heavy-rain penalty unreachable.
    if precipitation > 15:
        base *= 0.75
    elif precipitation > 5:
        base *= 0.85

    # Day-of-week pattern, indexed by date.weekday() (Monday=0 .. Sunday=6).
    day_factors = (0.95, 1.00, 1.05, 1.00, 1.10, 1.15, 1.05)
    base *= day_factors[date.weekday()]

    # Random variation of +/-15%.
    variation = random.uniform(0.85, 1.15)
    quantity = int(base * variation)

    # Floor at the per-product minimum.
    min_qty = {
        'pan': 80,
        'croissant': 40,
        'napolitana': 30,
        'palmera': 20,
        'cafe': 100,
    }
    quantity = max(min_qty[product_name], quantity)

    # Occasional slow day (5% chance). Applied after the floor on purpose,
    # so these days may fall below the usual minimum.
    if random.random() < 0.05:
        quantity = int(quantity * random.uniform(0.3, 0.6))

    return quantity
|
|
|
|
def generate_dataset(start_date=None, end_date=None):
    """Generate daily bakery sales records for every product over a date range.

    For each day from ``start_date`` to ``end_date`` (both inclusive) one
    temperature and one precipitation value are drawn, then one record per
    product in ``PRODUCTS`` is produced. Progress and per-product summary
    statistics are printed to stdout.

    Args:
        start_date: First day to generate. Defaults to ``datetime(2024, 9, 1)``.
        end_date: Last day to generate, inclusive. Defaults to
            ``datetime(2025, 9, 1)``. Must be the same type as ``start_date``.

    Returns:
        A list of dicts with the keys 'date' ('YYYY-MM-DD'), 'product_name',
        'quantity_sold' and 'revenue' (quantity times unit price, rounded to
        two decimals).

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    if start_date is None:
        start_date = datetime(2024, 9, 1)
    if end_date is None:
        end_date = datetime(2025, 9, 1)
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")

    records = []
    current_date = start_date

    print("Generating one year of bakery sales data...")
    print(f"Date range: {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")
    print(f"Products: {list(PRODUCTS)}")

    # Statistics tracking, one accumulator per product.
    product_stats = {
        name: {'total': 0, 'min': float('inf'), 'max': 0, 'count': 0, 'zeros': 0}
        for name in PRODUCTS
    }

    # Build the lookup once instead of scanning the list for every day.
    holiday_dates = set(HOLIDAYS)

    while current_date <= end_date:
        date_str = current_date.strftime('%Y-%m-%d')

        # Date properties.
        is_weekend = current_date.weekday() >= 5  # Saturday=5, Sunday=6
        is_holiday = date_str in holiday_dates

        # Environmental factors are drawn once per day and shared by all
        # products, before any per-product draw, to keep the random sequence
        # stable.
        temperature = get_temperature(current_date)
        precipitation = get_precipitation(current_date, temperature)

        for product_name, product_info in PRODUCTS.items():
            quantity = calculate_quantity(
                product_name, product_info, current_date,
                is_weekend, is_holiday, temperature, precipitation,
            )
            revenue = round(quantity * product_info['price'], 2)

            records.append({
                'date': date_str,
                'product_name': product_name,
                'quantity_sold': quantity,
                'revenue': revenue,
            })

            # Update statistics.
            stats = product_stats[product_name]
            stats['total'] += quantity
            stats['min'] = min(stats['min'], quantity)
            stats['max'] = max(stats['max'], quantity)
            stats['count'] += 1
            if quantity == 0:
                stats['zeros'] += 1

        current_date += timedelta(days=1)

    # Both endpoints are included, hence the +1.
    total_days = (end_date - start_date).days + 1

    # Print statistics.
    print("\nDataset generated successfully!")
    print(f"Total records: {len(records)}")
    print(f"Days: {total_days}")
    print(f"Products: {len(PRODUCTS)}")

    print("\nSales statistics by product:")
    for product in PRODUCTS:
        stats = product_stats[product]
        avg = stats['total'] / stats['count'] if stats['count'] > 0 else 0
        zero_pct = (stats['zeros'] / stats['count'] * 100) if stats['count'] > 0 else 0
        print(f" {product}:")
        print(f" Total sold: {stats['total']:,}")
        print(f" Avg daily: {avg:.1f}")
        print(f" Min daily: {stats['min']}")
        print(f" Max daily: {stats['max']}")
        print(f" Zero days: {stats['zeros']} ({zero_pct:.1f}%)")

    return records
|
|
|
|
if __name__ == '__main__':
    import sys

    # Generate dataset.
    records = generate_dataset()

    # Destination CSV. The historical default is kept, but it is an absolute
    # path inside one user's home directory, so an optional first command-line
    # argument may override it on other machines.
    default_output = '/Users/urtzialfaro/Downloads/bakery_data_2025_complete.csv'
    output_file = sys.argv[1] if len(sys.argv) > 1 else default_output

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['date', 'product_name', 'quantity_sold', 'revenue']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)

    print(f"\nDataset saved to: {output_file}")

    # Show a sample from both ends of the dataset.
    print("\nFirst 10 records:")
    for record in records[:10]:
        print(f" {record}")

    print("\nLast 10 records:")
    for record in records[-10:]:
        print(f" {record}")
|