[DOC] Add Kubernetes deployment guide with CPUs (#14865)
This commit is contained in:
parent
5eeadc2642
commit
3eb08ed9b1
@ -85,6 +85,7 @@ html_static_path = ["_static"]
|
||||
html_js_files = ["custom.js"]
|
||||
html_css_files = ["custom.css"]
|
||||
|
||||
myst_heading_anchors = 2
|
||||
myst_url_schemes = {
|
||||
'http': None,
|
||||
'https': None,
|
||||
|
@ -4,6 +4,9 @@
|
||||
|
||||
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
|
||||
|
||||
* [Deployment with CPUs](#deployment-with-cpus)
|
||||
* [Deployment with GPUs](#deployment-with-gpus)
|
||||
|
||||
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [Helm](frameworks/helm.md)
|
||||
* [InftyAI/llmaz](integrations/llmaz.md)
|
||||
@ -14,11 +17,107 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
|
||||
* [vllm-project/production-stack](integrations/production-stack.md)
|
||||
|
||||
## Pre-requisite
|
||||
## Deployment with CPUs
|
||||
|
||||
Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
:::{note}
|
||||
Using CPUs here is for demonstration and testing purposes only; performance will not be on par with GPUs.
|
||||
:::
|
||||
|
||||
## Deployment using native K8s
|
||||
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
|
||||
|
||||
```bash
|
||||
# Create the storage claim and the Hugging Face token Secret in one apply.
# Requires the HF_TOKEN environment variable to hold a Hugging Face access
# token; the ${HF_TOKEN:?...} expansion aborts the command if it is unset or
# empty instead of silently creating a Secret with a blank token.
cat <<EOF | kubectl apply -f -
# 50Gi ReadWriteOnce filesystem claim named vllm-models.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-models
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 50Gi
---
# Opaque Secret holding the token under the key "token".
# stringData takes the plain-text value and the API server encodes it;
# the data field would require the value to be base64-encoded already.
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
stringData:
  token: "${HF_TOKEN:?HF_TOKEN is not set}"
EOF
|
||||
```
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
```bash
|
||||
# Creates the vllm-server Deployment (one replica) and a ClusterIP Service
# exposing it on port 8000; both manifests are piped to kubectl on stdin.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm
    spec:
      # Volume backed by the vllm-models claim; mounted below at the
      # Hugging Face cache path.
      volumes:
        - name: llama-storage
          persistentVolumeClaim:
            claimName: vllm-models
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          command:
            - "/bin/sh"
            - "-c"
          args:
            - "vllm serve meta-llama/Llama-3.2-1B-Instruct"
          ports:
            - containerPort: 8000
          # Token is read from the key "token" of the hf-token-secret Secret.
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          volumeMounts:
            - name: llama-storage
              mountPath: /root/.cache/huggingface
---
# Routes TCP port 8000 to port 8000 on pods carrying the vllm name label.
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: vllm
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
EOF
|
||||
```
|
||||
|
||||
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
|
||||
|
||||
```console
|
||||
kubectl logs -l app.kubernetes.io/name=vllm
|
||||
...
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
```
|
||||
|
||||
## Deployment with GPUs
|
||||
|
||||
**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
|
||||
|
||||
1. Create a PVC, Secret and Deployment for vLLM
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user