# bakery-ia/services/training/Dockerfile

# Training service Dockerfile
#
# Stage "shared": stages the repository's shared/ directory under /shared so
# the final stage can pull it in with COPY --from=shared.
# NOTE(review): the shared/ and services/training/ source paths imply the
# build context is the repository root - confirm against the build command.
FROM python:3.11-slim AS shared
WORKDIR /shared
COPY shared/ ./

# Main service stage: this is the final stage, so it is the image that gets
# built and run. It uses the same base image as the "shared" stage above.
FROM python:3.11-slim

# All following relative paths (COPY destinations, pip -r arguments) resolve
# against /app.
WORKDIR /app

# Install the system toolchain needed to compile CmdStan (C/C++ compilers and
# make), plus curl, which the HEALTHCHECK at the end of this file invokes.
#
# --no-install-recommends keeps optional "recommended" packages out of the
# image. ca-certificates is listed explicitly so HTTPS downloads keep working
# even though recommended packages are no longer pulled in implicitly.
# The apt package lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    g++ \
    gcc \
    make \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies. Each requirements file is copied immediately
# before the install that consumes it, so editing one file only invalidates
# the build cache for its own install (and the layers after it), not for the
# other file's install.

# Tracing dependencies shared across services.
COPY shared/requirements-tracing.txt /tmp/
RUN pip install --no-cache-dir -r /tmp/requirements-tracing.txt

# Dependencies specific to the training service.
COPY services/training/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Point temporary files at a dedicated directory for CmdStan.
ENV TMPDIR=/tmp/cmdstan

# Install CmdStan for Prophet (required for model optimization).
# This step downloads and compiles CmdStan, so it is placed BEFORE the
# application code is copied: source-only changes then reuse the cached layer
# instead of rebuilding CmdStan on every build.
# TMPDIR is created here so that tools which do not fall back to /tmp when
# TMPDIR is missing still work during the build and at container start.
# NOTE(review): if /tmp is mounted fresh at runtime (e.g. a tmpfs/emptyDir),
# the application still has to recreate this directory itself.
RUN mkdir -p "${TMPDIR}" && \
    python -m pip install --no-cache-dir cmdstanpy && \
    python -m cmdstanpy.install_cmdstan

# Copy shared libraries from the "shared" stage.
COPY --from=shared /shared /app/shared

# Copy application code (changes most often, so it comes last among the
# filesystem-modifying steps).
COPY services/training/ .

# Make both the service code and the shared libraries importable. Any
# PYTHONPATH already defined by the base image is appended after them.
ENV PYTHONPATH="/app:/app/shared:${PYTHONPATH:-}"

# Document the port the service listens on (EXPOSE does not publish it).
EXPOSE 8000

# Container health probe: request /health on the local server every 30s after
# a 5s start-up grace period; each probe may take up to 10s, and the container
# is marked unhealthy after 3 consecutive failures. curl -f turns HTTP error
# responses into a non-zero exit status.
HEALTHCHECK --start-period=5s --interval=30s --timeout=10s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Start the ASGI app (app.main:app) with uvicorn, listening on all interfaces
# on port 8000. Exec (JSON-array) form is used, so uvicorn runs directly
# rather than under a shell.
#
# The WebSocket ping timeout is raised from uvicorn's default of 20s to 300s
# (5 minutes) to prevent premature disconnections during CPU-intensive ML
# training, which typically takes 2-3 minutes.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--ws-ping-timeout", "300"]