# services/training/app/models/training.py
"""
Database models for training service
"""
|
|
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
from sqlalchemy import Column, Integer, String, DateTime, Text, Boolean, JSON, Float
|
|
|
|
|
from sqlalchemy.dialects.postgresql import UUID, ARRAY
|
2025-07-25 20:01:37 +02:00
|
|
|
from shared.database.base import Base
|
2025-08-08 09:08:41 +02:00
|
|
|
from datetime import datetime, timezone
|
2025-07-17 14:34:24 +02:00
|
|
|
import uuid
|
|
|
|
|
|
2025-07-27 10:01:37 +02:00
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
class ModelTrainingLog(Base):
    """
    Table to track training job execution and status.

    Replaces the old Celery task tracking.
    """
    __tablename__ = "model_training_logs"

    id = Column(Integer, primary_key=True, index=True)
    # External job identifier (unique per job, used by API callers to poll status).
    job_id = Column(String(255), unique=True, index=True, nullable=False)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    status = Column(String(50), nullable=False, default="pending")  # pending, running, completed, failed, cancelled
    progress = Column(Integer, default=0)  # 0-100 percentage
    current_step = Column(String(500), default="")  # human-readable description of the current phase

    # Timestamps (timezone-aware; defaults evaluate at insert time, in UTC)
    start_time = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    end_time = Column(DateTime(timezone=True), nullable=True)  # set when the job finishes

    # Configuration and results
    config = Column(JSON, nullable=True)  # Training job configuration
    results = Column(JSON, nullable=True)  # Training results
    error_message = Column(Text, nullable=True)  # populated only on failure

    # Metadata
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc), onupdate=lambda: datetime.now(timezone.utc))


class ModelPerformanceMetric(Base):
    """
    Table to track model performance over time.

    One row per evaluation of a model (``model_id``) for a tenant/product
    pair. All metric columns are nullable because not every evaluation
    computes every metric.
    """
    __tablename__ = "model_performance_metrics"

    id = Column(Integer, primary_key=True, index=True)
    model_id = Column(String(255), index=True, nullable=False)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    inventory_product_id = Column(UUID(as_uuid=True), index=True, nullable=False)

    # Performance metrics
    mae = Column(Float, nullable=True)  # Mean Absolute Error
    mse = Column(Float, nullable=True)  # Mean Squared Error
    rmse = Column(Float, nullable=True)  # Root Mean Squared Error
    mape = Column(Float, nullable=True)  # Mean Absolute Percentage Error
    r2_score = Column(Float, nullable=True)  # R-squared score

    # Additional metrics
    accuracy_percentage = Column(Float, nullable=True)
    prediction_confidence = Column(Float, nullable=True)

    # Evaluation information
    # FIX: made timezone-aware for consistency with every other DateTime
    # column in this module — naive timestamps here cannot be compared
    # safely against the timezone-aware UTC defaults used elsewhere.
    # NOTE(review): requires a migration (ALTER COLUMN ... TYPE timestamptz)
    # on existing deployments.
    evaluation_period_start = Column(DateTime(timezone=True), nullable=True)
    evaluation_period_end = Column(DateTime(timezone=True), nullable=True)
    evaluation_samples = Column(Integer, nullable=True)  # number of samples used in the evaluation

    # Metadata
    measured_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))


class TrainingJobQueue(Base):
    """
    Table to manage training job queue and scheduling.

    Each row is one queued training job; ``status`` and ``retry_count``
    drive the worker's pick-up / retry logic.
    """
    __tablename__ = "training_job_queue"

    id = Column(Integer, primary_key=True, index=True)
    job_id = Column(String(255), unique=True, index=True, nullable=False)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)

    # Job configuration
    job_type = Column(String(50), nullable=False)  # full_training, single_product, evaluation
    priority = Column(Integer, default=1)  # Higher number = higher priority
    config = Column(JSON, nullable=True)

    # Scheduling information
    # FIX: made timezone-aware for consistency with every other DateTime
    # column in this module — naive values here would compare incorrectly
    # against the timezone-aware UTC defaults used elsewhere.
    # NOTE(review): requires a migration (ALTER COLUMN ... TYPE timestamptz)
    # on existing deployments.
    scheduled_at = Column(DateTime(timezone=True), nullable=True)
    started_at = Column(DateTime(timezone=True), nullable=True)
    estimated_duration_minutes = Column(Integer, nullable=True)

    # Status
    status = Column(String(50), nullable=False, default="queued")  # queued, running, completed, failed
    retry_count = Column(Integer, default=0)
    max_retries = Column(Integer, default=3)

    # Metadata
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc), onupdate=lambda: datetime.now(timezone.utc))
    # NOTE(review): unbounded String, unlike the String(255) identifiers above
    # — presumably a user identifier; confirm intended length before
    # tightening.
    cancelled_by = Column(String, nullable=True)


class ModelArtifact(Base):
    """
    Table to track model files and artifacts.
    """
    __tablename__ = "model_artifacts"

    id = Column(Integer, primary_key=True, index=True)
    model_id = Column(String(255), index=True, nullable=False)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)

    # Artifact information
    artifact_type = Column(String(50), nullable=False)  # model_file, metadata, training_data, etc.
    file_path = Column(String(1000), nullable=False)
    file_size_bytes = Column(Integer, nullable=True)
    checksum = Column(String(255), nullable=True)  # For file integrity

    # Storage information
    storage_location = Column(String(100), nullable=False, default="local")  # local, s3, gcs, etc.
    compression = Column(String(50), nullable=True)  # gzip, lz4, etc.

    # Metadata
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    expires_at = Column(DateTime(timezone=True), nullable=True)  # For automatic cleanup


class TrainedModel(Base):
    """
    Registry of trained forecasting models.

    One row per trained model for a tenant/product pair, holding file
    locations, training metrics, hyperparameters, and lifecycle flags
    (``is_active`` / ``is_production``).
    """
    __tablename__ = "trained_models"

    # Primary identification - Updated to use UUID properly
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    inventory_product_id = Column(UUID(as_uuid=True), nullable=False, index=True)

    # Model information
    model_type = Column(String, default="prophet_optimized")
    model_version = Column(String, default="1.0")
    job_id = Column(String, nullable=False)  # training job that produced this model

    # File storage
    model_path = Column(String, nullable=False)  # Path to the .pkl file
    metadata_path = Column(String)  # Path to metadata JSON

    # Training metrics
    mape = Column(Float)
    mae = Column(Float)
    rmse = Column(Float)
    r2_score = Column(Float)
    training_samples = Column(Integer)

    # Hyperparameters and features
    hyperparameters = Column(JSON)  # Store optimized parameters
    features_used = Column(JSON)  # List of regressor columns
    normalization_params = Column(JSON)  # Store feature normalization parameters for consistent predictions
    product_category = Column(String, nullable=True)  # Product category for category-specific forecasting

    # Model status
    is_active = Column(Boolean, default=True)
    is_production = Column(Boolean, default=False)

    # Timestamps - Updated to be timezone-aware with proper defaults
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc), onupdate=lambda: datetime.now(timezone.utc))
    last_used_at = Column(DateTime(timezone=True))

    # Training data info
    training_start_date = Column(DateTime(timezone=True))
    training_end_date = Column(DateTime(timezone=True))
    data_quality_score = Column(Float)

    # Additional metadata
    notes = Column(Text)
    created_by = Column(String)  # User who triggered training

    def to_dict(self):
        """
        Serialize the model row to a JSON-friendly dict.

        UUIDs are stringified, datetimes are ISO-8601 (or None). Note that
        ``model_id`` is intentionally an alias for ``id``, and ``features``
        duplicates ``features_used`` for frontend compatibility. Not all
        columns are included (e.g. job_id, metadata_path,
        normalization_params, notes, created_by are omitted).
        """
        return {
            "id": str(self.id),
            "model_id": str(self.id),
            "tenant_id": str(self.tenant_id),
            "inventory_product_id": str(self.inventory_product_id),
            "model_type": self.model_type,
            "model_version": self.model_version,
            "model_path": self.model_path,
            "mape": self.mape,
            "mae": self.mae,
            "rmse": self.rmse,
            "r2_score": self.r2_score,
            "training_samples": self.training_samples,
            "hyperparameters": self.hyperparameters,
            "features_used": self.features_used,
            "features": self.features_used,  # Alias for frontend compatibility (ModelDetailsModal expects 'features')
            "product_category": self.product_category,
            "is_active": self.is_active,
            "is_production": self.is_production,
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "updated_at": self.updated_at.isoformat() if self.updated_at else None,
            "last_used_at": self.last_used_at.isoformat() if self.last_used_at else None,
            "training_start_date": self.training_start_date.isoformat() if self.training_start_date else None,
            "training_end_date": self.training_end_date.isoformat() if self.training_end_date else None,
            "data_quality_score": self.data_quality_score
        }


class TrainingPerformanceMetrics(Base):
    """
    Table to track historical training performance for time estimation.

    Stores aggregated metrics from completed training jobs.
    """
    __tablename__ = "training_performance_metrics"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    job_id = Column(String(255), nullable=False, index=True)

    # Training job statistics
    total_products = Column(Integer, nullable=False)
    successful_products = Column(Integer, nullable=False)
    failed_products = Column(Integer, nullable=False)

    # Time metrics (seconds); phase breakdowns are optional
    total_duration_seconds = Column(Float, nullable=False)
    avg_time_per_product = Column(Float, nullable=False)  # Key metric for estimation
    data_analysis_time_seconds = Column(Float, nullable=True)
    training_time_seconds = Column(Float, nullable=True)
    finalization_time_seconds = Column(Float, nullable=True)

    # Job metadata
    completed_at = Column(DateTime(timezone=True), nullable=False, default=lambda: datetime.now(timezone.utc))
    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))

    def __repr__(self):
        """Compact debug representation with the key estimation inputs."""
        return (
            f"<TrainingPerformanceMetrics("
            f"tenant_id={self.tenant_id}, "
            f"job_id={self.job_id}, "
            f"total_products={self.total_products}, "
            f"avg_time_per_product={self.avg_time_per_product:.2f}s"
            f")>"
        )

    def to_dict(self):
        """
        Serialize the row to a JSON-friendly dict.

        UUIDs are stringified; datetimes are ISO-8601 strings or None.
        """
        return {
            "id": str(self.id),
            "tenant_id": str(self.tenant_id),
            "job_id": self.job_id,
            "total_products": self.total_products,
            "successful_products": self.successful_products,
            "failed_products": self.failed_products,
            "total_duration_seconds": self.total_duration_seconds,
            "avg_time_per_product": self.avg_time_per_product,
            "data_analysis_time_seconds": self.data_analysis_time_seconds,
            "training_time_seconds": self.training_time_seconds,
            "finalization_time_seconds": self.finalization_time_seconds,
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
            "created_at": self.created_at.isoformat() if self.created_at else None
        }