# ---- Builder stage ----------------------------------------------------------
# Installs the project's Python dependencies and the gRPC tooling. The final
# stage copies /usr/local/lib/python3.10 and /usr/local/bin out of this stage,
# so everything pip installs here is expected to land under /usr/local.
FROM nvidia/cuda:12.8.0-runtime-ubuntu22.04 AS builder

# DEBIAN_FRONTEND is set inline (not via ENV) so that package configuration
# prompts such as tzdata's cannot stall an unattended build, while the setting
# does not leak into the image's environment.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ffmpeg \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Only the manifest is copied before installing, so this dependency layer is
# reused from cache until pyproject.toml itself changes.
# NOTE(review): the editable install runs with only pyproject.toml present;
# the package sources are added under the same /app path in the final stage.
# Confirm the project's build backend tolerates this.
COPY pyproject.toml .
# `python3 -m pip` guarantees pip runs under the same interpreter that later
# runs grpc_tools and the server.
RUN python3 -m pip install --no-cache-dir -e ".[inference,rag]" \
    && python3 -m pip install --no-cache-dir grpcio-tools grpcio-health-checking scipy

# Compile every .proto file under proto/ into Python message modules and gRPC
# stubs. The output directory (/app/inference/generated) is what the final
# stage copies out of this builder stage.
COPY proto/ ./proto/
RUN mkdir -p inference/generated \
    && python3 -m grpc_tools.protoc \
        --proto_path=proto/ \
        --python_out=inference/generated \
        --grpc_python_out=inference/generated \
        proto/*.proto

# ---- Runtime stage ----------------------------------------------------------
# Same base image and OS package set as the builder stage, so the Python
# packages copied from the builder run against a matching interpreter.
FROM nvidia/cuda:12.8.0-runtime-ubuntu22.04

# DEBIAN_FRONTEND is set inline (not via ENV) so that package configuration
# prompts such as tzdata's cannot stall an unattended build, while the setting
# does not persist into the running container's environment.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ffmpeg \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Python packages and console scripts that pip installed in the builder stage.
COPY --from=builder /usr/local/lib/python3.10 /usr/local/lib/python3.10
COPY --from=builder /usr/local/bin /usr/local/bin

# Application sources, model assets and configuration from the build context.
COPY pyproject.toml .
COPY inference/ ./inference/
COPY models/flash_head/ ./models/flash_head/
COPY cyberverse_config.yaml .

# The generated gRPC stubs are copied AFTER inference/ on purpose: COPY merges
# directories and later copies win, so any stale inference/generated files that
# happen to be in the build context cannot overwrite the stubs freshly built
# from proto/ in the builder stage.
COPY --from=builder /app/inference/generated ./inference/generated/

# Run the service as an unprivileged user with a fixed numeric UID/GID so
# orchestrators can verify it is non-root. A home directory is created so
# libraries that cache under $HOME have a writable location.
# NOTE(review): files under /app remain root-owned, i.e. read-only for this
# user. If the server must write inside /app, mount a volume or adjust
# ownership for that specific path.
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --create-home app
USER app

# The probe succeeds once a gRPC channel to the local server port becomes
# ready within 3 seconds. --start-period keeps probe failures during startup
# from counting toward --retries; 120s is a guess for this service's startup
# time -- tune it to the observed value.
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD python3 -c "import grpc; ch = grpc.insecure_channel('localhost:50051'); grpc.channel_ready_future(ch).result(timeout=3)" || exit 1

# Port probed by the health check above (EXPOSE documents it; it does not publish it).
EXPOSE 50051
# Exec form: the Python process is PID 1 and receives stop signals directly.
CMD ["python3", "-m", "inference.server", "--config", "cyberverse_config.yaml"]