forked from huggingface/text-embeddings-inference
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDockerfile-cuda-all
144 lines (108 loc) · 4.67 KB
/
Dockerfile-cuda-all
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
ENV SCCACHE=0.5.4
ENV RUSTC_WRAPPER=/usr/local/bin/sccache
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
curl \
libssl-dev \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
# Donwload and configure sccache
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
chmod +x /usr/local/bin/sccache
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN cargo install cargo-chef --locked
FROM base-builder AS planner
WORKDIR /usr/src
COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./
RUN cargo chef prepare --recipe-path recipe.json
FROM base-builder AS builder
ARG GIT_SHA
ARG DOCKER_LABEL
ARG VERTEX="false"
# sccache specific variables
ARG ACTIONS_CACHE_URL
ARG ACTIONS_RUNTIME_TOKEN
ARG SCCACHE_GHA_ENABLED
# Limit parallelism
ARG RAYON_NUM_THREADS=4
ARG CARGO_BUILD_JOBS
ARG CARGO_BUILD_INCREMENTAL
WORKDIR /usr/src
COPY --from=planner /usr/src/recipe.json recipe.json
RUN if [ $VERTEX = "true" ]; \
then \
cargo chef cook --release --features google --recipe-path recipe.json && sccache -s; \
else \
cargo chef cook --release --recipe-path recipe.json && sccache -s; \
fi;
RUN if [ $VERTEX = "true" ]; \
then \
CUDA_COMPUTE_CAP=75 cargo chef cook --release --features google --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
else \
CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
fi;
RUN if [ $VERTEX = "true" ]; \
then \
CUDA_COMPUTE_CAP=80 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
else \
CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
fi;
RUN if [ $VERTEX = "true" ]; \
then \
CUDA_COMPUTE_CAP=90 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
else \
CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
fi;
COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./
RUN if [ $VERTEX = "true" ]; \
then \
CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F google && sccache -s; \
else \
CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s; \
fi;
RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-75
RUN if [ $VERTEX = "true" ]; \
then \
CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \
else \
CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
fi;
RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80
RUN if [ $VERTEX = "true" ]; \
then \
CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \
else \
CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
fi;
RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base
ARG DEFAULT_USE_FLASH_ATTENTION=True
ENV HUGGINGFACE_HUB_CACHE=/data \
PORT=80 \
USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
libssl-dev \
curl \
&& rm -rf /var/lib/apt/lists/*
COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local/bin/text-embeddings-router-75
COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
# Amazon SageMaker compatible image
FROM base AS sagemaker
COPY --chmod=775 sagemaker-entrypoint-cuda-all.sh entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
# Default image
FROM base
COPY --chmod=775 cuda-all-entrypoint.sh entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
CMD ["--json-output"]