#!/usr/bin/env python3
"""
Generate realistic one-year bakery sales data for AI model training.

Creates daily sales records per product, applying weekend, holiday, seasonal,
weather and day-of-week factors plus random variation.
Pure Python - no external dependencies.

Usage:
    python <this script> [output_csv_path]

When no path is given the data is written to ``DEFAULT_OUTPUT_FILE``.
"""

import csv
import random
import sys
from datetime import datetime, timedelta
from math import cos, log, pi, sqrt

# Set random seed for reproducibility
random.seed(42)

# Products with base quantities and prices
PRODUCTS = {
    'pan': {'base_qty': 200, 'price': 1.20, 'weekend_factor': 0.85, 'holiday_factor': 1.30},
    'croissant': {'base_qty': 110, 'price': 1.50, 'weekend_factor': 1.20, 'holiday_factor': 1.25},
    'napolitana': {'base_qty': 75, 'price': 1.80, 'weekend_factor': 1.15, 'holiday_factor': 1.20},
    'palmera': {'base_qty': 50, 'price': 1.60, 'weekend_factor': 1.25, 'holiday_factor': 1.15},
    'cafe': {'base_qty': 280, 'price': 1.40, 'weekend_factor': 0.75, 'holiday_factor': 0.90},
}

# Spanish holidays covering the default date range (2024-09-01 .. 2025-09-01)
# and the rest of 2025. The 2024 entries are the fixed-date holidays that fall
# inside the default range; without them the first four months of generated
# data would never receive a holiday adjustment.
HOLIDAYS = [
    '2024-10-12',  # National Day
    '2024-11-01',  # All Saints' Day
    '2024-12-06',  # Constitution Day
    '2024-12-08',  # Immaculate Conception
    '2024-12-25',  # Christmas Day
    '2025-01-01',  # New Year's Day
    '2025-01-06',  # Epiphany
    '2025-04-18',  # Good Friday
    '2025-05-01',  # Labour Day
    '2025-08-15',  # Assumption
    '2025-10-12',  # National Day
    '2025-11-01',  # All Saints' Day
    '2025-12-06',  # Constitution Day
    '2025-12-08',  # Immaculate Conception
    '2025-12-25',  # Christmas Day
]

# Default (inclusive) date range and output location used by the script.
DEFAULT_START_DATE = datetime(2024, 9, 1)
DEFAULT_END_DATE = datetime(2025, 9, 1)
DEFAULT_OUTPUT_FILE = '/Users/urtzialfaro/Downloads/bakery_data_2025_complete.csv'

# Base temperature (deg C) per calendar month used by get_temperature().
_BASE_TEMPERATURES = {
    1: 8, 2: 10, 3: 13, 4: 16, 5: 20, 6: 26,
    7: 30, 8: 30, 9: 25, 10: 18, 11: 12, 12: 9,
}

# Probability of a rainy day per calendar month (higher in winter/spring).
_RAIN_PROBABILITY = {
    1: 0.25, 2: 0.25, 3: 0.20, 4: 0.25, 5: 0.20, 6: 0.10,
    7: 0.05, 8: 0.05, 9: 0.15, 10: 0.20, 11: 0.25, 12: 0.25,
}

# Multiplier per weekday, indexed by date.weekday() (Monday=0 .. Sunday=6).
_DAY_FACTORS = [0.95, 1.00, 1.05, 1.00, 1.10, 1.15, 1.05]

# Floor applied to each product's daily quantity before the rare
# "low-sales day" reduction.
_MIN_QUANTITY = {
    'pan': 80,
    'croissant': 40,
    'napolitana': 30,
    'palmera': 20,
    'cafe': 100,
}

_CSV_FIELDNAMES = ['date', 'product_name', 'quantity_sold', 'revenue']


def random_normal(mean=0, std=1):
    """Return a normally distributed random number (Box-Muller transform).

    Args:
        mean: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        A float drawn from N(mean, std**2).
    """
    # random.random() is in [0, 1); flip it to (0, 1] so log() never sees 0.
    u1 = 1.0 - random.random()
    u2 = random.random()
    z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * pi * u2)
    return mean + z0 * std


def get_temperature(date):
    """Return a temperature for the given date based on its month.

    The value is the month's base temperature plus a uniform variation of
    +/-4 degrees, clamped at 0 and rounded to one decimal place.
    """
    base = _BASE_TEMPERATURES[date.month]
    variation = random.uniform(-4, 4)
    return round(max(0, base + variation), 1)


def get_precipitation(date, temperature):
    """Return the precipitation in mm for the given date.

    Rain occurs with a month-dependent probability; when it does, the amount
    is uniform in [2, 25] mm rounded to one decimal place, otherwise 0.0.

    Args:
        date: The day to generate precipitation for (only the month is used).
        temperature: Accepted for interface compatibility; currently unused.
    """
    if random.random() < _RAIN_PROBABILITY[date.month]:
        # Rain amount in mm
        return round(random.uniform(2, 25), 1)
    return 0.0


def calculate_quantity(product_name, product_info, date, is_weekend, is_holiday,
                       temperature, precipitation):
    """Calculate the quantity sold for one product on one day.

    Args:
        product_name: Key of the product (e.g. ``'pan'``, ``'cafe'``).
        product_info: Dict with ``base_qty``, ``weekend_factor`` and
            ``holiday_factor`` entries.
        date: The day being generated.
        is_weekend: Whether the day is a Saturday or Sunday.
        is_holiday: Whether the day is a holiday.
        temperature: Temperature for the day.
        precipitation: Precipitation for the day in mm.

    Returns:
        The integer quantity sold.
    """
    base = product_info['base_qty']

    # Weekend adjustment
    if is_weekend:
        base *= product_info['weekend_factor']

    # Holiday adjustment
    if is_holiday:
        base *= product_info['holiday_factor']

    # Seasonal adjustment
    month = date.month
    if month in (12, 1):  # Christmas/New Year boost
        base *= 1.15
    elif month in (7, 8):  # Summer vacation dip
        base *= 0.90
    elif month in (4, 5, 9, 10):  # Spring/Fall moderate
        base *= 1.05

    # Temperature effect
    if product_name == 'cafe':
        # More coffee when cold
        if temperature < 12:
            base *= 1.15
        elif temperature > 28:
            base *= 0.85
    else:
        # Pastries sell better in moderate weather
        if 15 <= temperature <= 25:
            base *= 1.05
        elif temperature > 30:
            base *= 0.90

    # Precipitation effect. The heavier threshold must be tested first,
    # otherwise "> 5" also captures every "> 15" day and the stronger
    # reduction is never applied.
    if precipitation > 15:
        base *= 0.75
    elif precipitation > 5:
        base *= 0.85

    # Day of week pattern (Mon-Sun)
    base *= _DAY_FACTORS[date.weekday()]

    # Add random variation (+/-15%)
    variation = random.uniform(0.85, 1.15)
    quantity = int(base * variation)

    # Ensure minimum sales
    quantity = max(_MIN_QUANTITY[product_name], quantity)

    # Add occasional low-sales days (5% chance); these may drop below the floor
    if random.random() < 0.05:
        quantity = int(quantity * random.uniform(0.3, 0.6))

    return quantity


def _print_summary(records, product_stats, total_days):
    """Print record counts and per-product sales statistics."""
    print("\nDataset generated successfully!")
    print(f"Total records: {len(records)}")
    print(f"Days: {total_days}")
    print(f"Products: {len(PRODUCTS)}")

    print("\nSales statistics by product:")
    for product in PRODUCTS:
        stats = product_stats[product]
        count = stats['count']
        avg = stats['total'] / count if count > 0 else 0
        zero_pct = (stats['zeros'] / count * 100) if count > 0 else 0
        print(f" {product}:")
        print(f" Total sold: {stats['total']:,}")
        print(f" Avg daily: {avg:.1f}")
        print(f" Min daily: {stats['min']}")
        print(f" Max daily: {stats['max']}")
        print(f" Zero days: {stats['zeros']} ({zero_pct:.1f}%)")


def generate_dataset(start_date=DEFAULT_START_DATE, end_date=DEFAULT_END_DATE):
    """Generate the bakery sales dataset for an inclusive date range.

    Args:
        start_date: First day to generate (datetime). Defaults to 2024-09-01.
        end_date: Last day to generate, inclusive (datetime). Defaults to
            2025-09-01.

    Returns:
        A list of dicts with keys ``date``, ``product_name``,
        ``quantity_sold`` and ``revenue``; one entry per product per day.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")

    print("Generating one year of bakery sales data...")
    print(f"Date range: {start_date.date()} to {end_date.date()}")
    print(f"Products: {list(PRODUCTS.keys())}")

    # Statistics tracking
    product_stats = {
        name: {'total': 0, 'min': float('inf'), 'max': 0, 'count': 0, 'zeros': 0}
        for name in PRODUCTS
    }

    # Build the lookup set once per call instead of scanning the list daily.
    holidays = set(HOLIDAYS)

    records = []
    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime('%Y-%m-%d')

        # Date properties
        is_weekend = current_date.weekday() >= 5  # Saturday=5, Sunday=6
        is_holiday = date_str in holidays

        # Environmental factors (shared by all products on the same day)
        temperature = get_temperature(current_date)
        precipitation = get_precipitation(current_date, temperature)

        # Generate sales for each product
        for product_name, product_info in PRODUCTS.items():
            quantity = calculate_quantity(
                product_name, product_info, current_date,
                is_weekend, is_holiday, temperature, precipitation
            )
            revenue = round(quantity * product_info['price'], 2)

            records.append({
                'date': date_str,
                'product_name': product_name,
                'quantity_sold': quantity,
                'revenue': revenue,
            })

            # Update statistics
            stats = product_stats[product_name]
            stats['total'] += quantity
            stats['min'] = min(stats['min'], quantity)
            stats['max'] = max(stats['max'], quantity)
            stats['count'] += 1
            if quantity == 0:
                stats['zeros'] += 1

        current_date += timedelta(days=1)

    # Both endpoints are included in the range, hence the +1.
    total_days = (end_date - start_date).days + 1
    _print_summary(records, product_stats, total_days)

    return records


def _save_to_csv(records, output_file):
    """Write the records to ``output_file`` as CSV with a header row."""
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=_CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(records)


def main(argv=None):
    """Generate the dataset, save it to CSV and print a sample.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, overrides the
            output CSV path.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    output_file = args[0] if args else DEFAULT_OUTPUT_FILE

    # Generate dataset
    records = generate_dataset()

    # Save to CSV
    _save_to_csv(records, output_file)
    print(f"\nDataset saved to: {output_file}")

    # Show sample
    print("\nFirst 10 records:")
    for record in records[:10]:
        print(f" {record}")

    print("\nLast 10 records:")
    for record in records[-10:]:
        print(f" {record}")


if __name__ == '__main__':
    main()