#FROM python:3.11-slim-bookworm AS vector-model
#COPY installer/install_model.py install_model.py
#RUN pip3 install --upgrade pip setuptools && \
#    pip install pycrawlers && \
#    pip install transformers && \
#    python3 install_model.py && \
#    cp -r model/base/hub model/tokenizer
#FROM scratch
#COPY --from=vector-model model /opt/maxkb-app/model

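# For unknown reasons, re-pulling the vector model with the script above yields a much larger model than before, so the script below reuses the previously built vector model instead.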

# Global build arguments selecting the prebuilt vector-model image.
# They must be declared before the first FROM: a FROM instruction can only
# expand ARGs from the global scope, and an ARG placed after a FROM belongs
# to that stage and is invisible to later FROM lines.
# LZKB_VECTOR_MODEL_BASE_IMAGE is the value actually used; it defaults to
# NEBULA_VECTOR_MODEL_BASE_IMAGE, and either may be overridden with --build-arg.
ARG NEBULA_VECTOR_MODEL_BASE_IMAGE=nebulakb/vector-model:v1.0.1
ARG LZKB_VECTOR_MODEL_BASE_IMAGE=${NEBULA_VECTOR_MODEL_BASE_IMAGE}

# tmp-stage1: runs the bert-base-cased install script, then copies
# model/base/hub to model/tokenizer for a later stage to pick up.
# NOTE(review): the script is presumed to populate model/base/hub - confirm
# against installer/install_model_bert_base_cased.py.
# No WORKDIR is set on purpose: a later stage copies the relative path
# model/tokenizer from this stage.
FROM python:3.11-slim-bookworm AS tmp-stage1
COPY installer/install_model_bert_base_cased.py install_model_bert_base_cased.py
RUN pip3 install --upgrade pip setuptools && \
    pip install pycrawlers && \
    pip install transformers && \
    python3 install_model_bert_base_cased.py && \
    cp -r model/base/hub model/tokenizer

# vector-model: the prebuilt image, used only as a source for COPY --from.
FROM ${LZKB_VECTOR_MODEL_BASE_IMAGE} AS vector-model

# tmp-stage2: assembles the final /opt/maxkb-app/model tree in a throwaway
# Alpine stage. The base image tag is pinned so rebuilds do not silently pick
# up a different Alpine release.
FROM alpine:3.20 AS tmp-stage2
# Copy order matters: the prebuilt model tree first, then its base/hub content
# as the tokenizer directory, then the tokenizer files produced by tmp-stage1
# on top of it.
COPY --from=vector-model /opt/maxkb/app/model /opt/maxkb-app/model
COPY --from=vector-model /opt/maxkb/app/model/base/hub /opt/maxkb-app/model/tokenizer
COPY --from=tmp-stage1 model/tokenizer /opt/maxkb-app/model/tokenizer
# Drop the onnx directory of the embedding model so it is not carried into the
# final image.
RUN rm -rf /opt/maxkb-app/model/embedding/shibing624_text2vec-base-chinese/onnx
# Download the cl100k_base tiktoken encoding and place it under the tokenizer
# directory. curl -f makes HTTP errors fail the build instead of saving an
# error page; --no-cache already refreshes the apk index, so --update is
# not needed.
RUN apk add --no-cache curl && \
    mkdir -p openai-tiktoken-cl100k-base && \
    curl -fL -o openai-tiktoken-cl100k-base/cl100k_base.tiktoken \
        https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    mv -f openai-tiktoken-cl100k-base /opt/maxkb-app/model/tokenizer/


# Final image: an empty (scratch) base containing only the assembled model
# directory copied from tmp-stage2.
FROM scratch
COPY --from=tmp-stage2 /opt/maxkb-app/model /opt/maxkb-app/model
