Add example of helm chart for vllm deployment on k8s (#9199)

Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
This commit is contained in:
Maxime Fournioux 2024-12-10 10:19:27 +01:00 committed by GitHub
parent 82c73fd510
commit fe2e10c71b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 1206 additions and 0 deletions

81
.github/workflows/lint-and-deploy.yaml vendored Normal file
View File

@ -0,0 +1,81 @@
name: Lint and Deploy Charts
on: pull_request
jobs:
lint-and-deploy:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
with:
version: v3.14.4
#Python is required because ct lint runs Yamale and yamllint which require Python.
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: '3.13'
- name: Set up chart-testing
uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
with:
version: v3.10.1
- name: Run chart-testing (lint)
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
- name: Setup minio
run: |
docker network create vllm-net
docker run -d -p 9000:9000 --name minio --net vllm-net \
-e "MINIO_ACCESS_KEY=minioadmin" \
-e "MINIO_SECRET_KEY=minioadmin" \
-v /tmp/data:/data \
-v /tmp/config:/root/.minio \
minio/minio server /data
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_EC2_METADATA_DISABLED=true
mkdir opt-125m
cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
- name: Create kind cluster
uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster
run: |
docker pull amazon/aws-cli:2.6.4
kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
kind load docker-image vllm-cpu-env:latest --name chart-testing
docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
kubectl create ns ns-vllm
- name: Run chart-testing (install)
run: |
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
sleep 10
CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
--header "Content-Type: application/json" \
--data '{
"model": "opt-125m",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'):$CODE"
echo "$CODE"

View File

@ -82,6 +82,7 @@ Documentation
serving/openai_compatible_server serving/openai_compatible_server
serving/deploying_with_docker serving/deploying_with_docker
serving/deploying_with_k8s serving/deploying_with_k8s
serving/deploying_with_helm
serving/deploying_with_nginx serving/deploying_with_nginx
serving/distributed_serving serving/distributed_serving
serving/metrics serving/metrics

Binary file not shown.

After

Width:  |  Height:  |  Size: 968 KiB

View File

@ -0,0 +1,253 @@
.. _deploying_with_helm:
Deploying with Helm
===================
A Helm chart to deploy vLLM for Kubernetes
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
Prerequisites
-------------
Before you begin, ensure that you have the following:
- A running Kubernetes cluster
- NVIDIA Kubernetes Device Plugin (``k8s-device-plugin``): This can be found at `https://github.com/NVIDIA/k8s-device-plugin <https://github.com/NVIDIA/k8s-device-plugin>`__
- Available GPU resources in your cluster
- S3 with the model which will be deployed
Installing the chart
--------------------
To install the chart with the release name ``test-vllm``:
.. code-block:: console
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3buckername=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
Uninstalling the Chart
----------------------
To uninstall the ``test-vllm`` deployment:
.. code-block:: console
helm uninstall test-vllm --namespace=ns-vllm
The command removes all the Kubernetes components associated with the
chart **including persistent volumes** and deletes the release.
Architecture
------------
.. image:: architecture_helm_deployment.png
Values
------
.. list-table:: Values
:widths: 25 25 25 25
:header-rows: 1
* - Key
- Type
- Default
- Description
* - autoscaling
- object
- {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
- Autoscaling configuration
* - autoscaling.enabled
- bool
- false
- Enable autoscaling
* - autoscaling.maxReplicas
- int
- 100
- Maximum replicas
* - autoscaling.minReplicas
- int
- 1
- Minimum replicas
* - autoscaling.targetCPUUtilizationPercentage
- int
- 80
- Target CPU utilization for autoscaling
* - configs
- object
- {}
- Configmap
* - containerPort
- int
- 8000
- Container port
* - customObjects
- list
- []
- Custom Objects configuration
* - deploymentStrategy
- object
- {}
- Deployment strategy configuration
* - externalConfigs
- list
- []
- External configuration
* - extraContainers
- list
- []
- Additional containers configuration
* - extraInit
- object
- {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
- Additional configuration for the init container
* - extraInit.pvcStorage
- string
- "50Gi"
- Storage size of the s3
* - extraInit.s3modelpath
- string
- "relative_s3_model_path/opt-125m"
- Path of the model on the s3 which hosts model weights and config files
* - extraInit.awsEc2MetadataDisabled
- boolean
- true
- Disables the use of the Amazon EC2 instance metadata service
* - extraPorts
- list
- []
- Additional ports configuration
* - gpuModels
- list
- ["TYPE_GPU_USED"]
- Type of gpu used
* - image
- object
- {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
- Image configuration
* - image.command
- list
- ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
- Container launch command
* - image.repository
- string
- "vllm/vllm-openai"
- Image repository
* - image.tag
- string
- "latest"
- Image tag
* - livenessProbe
- object
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
- Liveness probe configuration
* - livenessProbe.failureThreshold
- int
- 3
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
* - livenessProbe.httpGet
- object
- {"path":"/health","port":8000}
- Configuration of the Kubelet http request on the server
* - livenessProbe.httpGet.path
- string
- "/health"
- Path to access on the HTTP server
* - livenessProbe.httpGet.port
- int
- 8000
- Name or number of the port to access on the container, on which the server is listening
* - livenessProbe.initialDelaySeconds
- int
- 15
- Number of seconds after the container has started before liveness probe is initiated
* - livenessProbe.periodSeconds
- int
- 10
- How often (in seconds) to perform the liveness probe
* - maxUnavailablePodDisruptionBudget
- string
- ""
- Disruption Budget Configuration
* - readinessProbe
- object
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
- Readiness probe configuration
* - readinessProbe.failureThreshold
- int
- 3
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
* - readinessProbe.httpGet
- object
- {"path":"/health","port":8000}
- Configuration of the Kubelet http request on the server
* - readinessProbe.httpGet.path
- string
- "/health"
- Path to access on the HTTP server
* - readinessProbe.httpGet.port
- int
- 8000
- Name or number of the port to access on the container, on which the server is listening
* - readinessProbe.initialDelaySeconds
- int
- 5
- Number of seconds after the container has started before readiness probe is initiated
* - readinessProbe.periodSeconds
- int
- 5
- How often (in seconds) to perform the readiness probe
* - replicaCount
- int
- 1
- Number of replicas
* - resources
- object
- {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
- Resource configuration
* - resources.limits."nvidia.com/gpu"
- int
- 1
- Number of gpus used
* - resources.limits.cpu
- int
- 4
- Number of CPUs
* - resources.limits.memory
- string
- "16Gi"
- CPU memory configuration
* - resources.requests."nvidia.com/gpu"
- int
- 1
- Number of gpus used
* - resources.requests.cpu
- int
- 4
- Number of CPUs
* - resources.requests.memory
- string
- "16Gi"
- CPU memory configuration
* - secrets
- object
- {}
- Secrets configuration
* - serviceName
- string
-
- Service name
* - servicePort
- int
- 80
- Service port
* - labels.environment
- string
- test
- Environment name
* - labels.release
- string
- test
- Release name

View File

@ -0,0 +1,6 @@
*.png
.git/
ct.yaml
lintconf.yaml
values.schema.json
/workflows

View File

@ -0,0 +1,21 @@
apiVersion: v2
name: chart-vllm
description: Chart vllm
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.1
maintainers:
- name: mfournioux

View File

@ -0,0 +1,3 @@
chart-dirs:
- charts
validate-maintainers: false

View File

@ -0,0 +1,42 @@
---
rules:
braces:
min-spaces-inside: 0
max-spaces-inside: 0
min-spaces-inside-empty: -1
max-spaces-inside-empty: -1
brackets:
min-spaces-inside: 0
max-spaces-inside: 0
min-spaces-inside-empty: -1
max-spaces-inside-empty: -1
colons:
max-spaces-before: 0
max-spaces-after: 1
commas:
max-spaces-before: 0
min-spaces-after: 1
max-spaces-after: 1
comments:
require-starting-space: true
min-spaces-from-content: 2
document-end: disable
document-start: disable # No --- to start a file
empty-lines:
max: 2
max-start: 0
max-end: 0
hyphens:
max-spaces-after: 1
indentation:
spaces: consistent
indent-sequences: whatever # - list indentation will handle both indentation and without
check-multi-line-strings: false
key-duplicates: enable
line-length: disable # Lines can be any length
new-line-at-end-of-file: disable
new-lines:
type: unix
trailing-spaces: enable
truthy:
level: warning

View File

@ -0,0 +1,164 @@
{{/*
Define ports for the pods
*/}}
{{- define "chart.container-port" -}}
{{- default "8000" .Values.containerPort }}
{{- end }}
{{/*
Define service name
*/}}
{{- define "chart.service-name" -}}
{{- if .Values.serviceName }}
{{- .Values.serviceName | lower | trim }}
{{- else }}
"{{ .Release.Name }}-service"
{{- end }}
{{- end }}
{{/*
Define service port
*/}}
{{- define "chart.service-port" -}}
{{- if .Values.servicePort }}
{{- .Values.servicePort }}
{{- else }}
{{- include "chart.container-port" . }}
{{- end }}
{{- end }}
{{/*
Define service port name
*/}}
{{- define "chart.service-port-name" -}}
"service-port"
{{- end }}
{{/*
Define container port name
*/}}
{{- define "chart.container-port-name" -}}
"container-port"
{{- end }}
{{/*
Define deployment strategy
*/}}
{{- define "chart.strategy" -}}
strategy:
{{- if not .Values.deploymentStrategy }}
rollingUpdate:
maxSurge: 100%
maxUnavailable: 0
{{- else }}
{{ toYaml .Values.deploymentStrategy | indent 2 }}
{{- end }}
{{- end }}
{{/*
Define additional ports
*/}}
{{- define "chart.extraPorts" }}
{{- with .Values.extraPorts }}
{{ toYaml . }}
{{- end }}
{{- end }}
{{/*
Define chart external ConfigMaps and Secrets
*/}}
{{- define "chart.externalConfigs" -}}
{{- with .Values.externalConfigs -}}
{{ toYaml . }}
{{- end }}
{{- end }}
{{/*
Define liveness et readiness probes
*/}}
{{- define "chart.probes" -}}
{{- if .Values.readinessProbe }}
readinessProbe:
{{- with .Values.readinessProbe }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- if .Values.livenessProbe }}
livenessProbe:
{{- with .Values.livenessProbe }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Define resources
*/}}
{{- define "chart.resources" -}}
requests:
memory: {{ required "Value 'resources.requests.memory' must be defined !" .Values.resources.requests.memory | quote }}
cpu: {{ required "Value 'resources.requests.cpu' must be defined !" .Values.resources.requests.cpu | quote }}
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
nvidia.com/gpu: {{ required "Value 'resources.requests.nvidia.com/gpu' must be defined !" (index .Values.resources.requests "nvidia.com/gpu") | quote }}
{{- end }}
limits:
memory: {{ required "Value 'resources.limits.memory' must be defined !" .Values.resources.limits.memory | quote }}
cpu: {{ required "Value 'resources.limits.cpu' must be defined !" .Values.resources.limits.cpu | quote }}
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
nvidia.com/gpu: {{ required "Value 'resources.limits.nvidia.com/gpu' must be defined !" (index .Values.resources.limits "nvidia.com/gpu") | quote }}
{{- end }}
{{- end }}
{{/*
Define User used for the main container
*/}}
{{- define "chart.user" }}
{{- if .Values.image.runAsUser }}
runAsUser:
{{- with .Values.runAsUser }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}
{{- end }}
{{- define "chart.extraInitImage" -}}
"amazon/aws-cli:2.6.4"
{{- end }}
{{- define "chart.extraInitEnv" -}}
- name: S3_ENDPOINT_URL
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3endpoint
- name: S3_BUCKET_NAME
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3bucketname
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3accesskeyid
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: {{ .Release.Name }}-secrets
key: s3accesskey
- name: S3_PATH
value: "{{ .Values.extraInit.s3modelpath }}"
- name: AWS_EC2_METADATA_DISABLED
value: "{{ .Values.extraInit.awsEc2MetadataDisabled }}"
{{- end }}
{{/*
Define chart labels
*/}}
{{- define "chart.labels" -}}
{{- with .Values.labels -}}
{{ toYaml . }}
{{- end }}
{{- end }}

View File

@ -0,0 +1,11 @@
{{- if .Values.configs -}}
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Release.Name }}-configs"
namespace: {{ .Release.Namespace }}
data:
{{- with .Values.configs }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end -}}

View File

@ -0,0 +1,6 @@
{{- if .Values.customObjects }}
{{- range .Values.customObjects }}
{{- tpl (. | toYaml) $ }}
---
{{- end }}
{{- end }}

View File

@ -0,0 +1,122 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ .Release.Name }}-deployment-vllm"
namespace: {{ .Release.Namespace }}
labels:
{{- include "chart.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.replicaCount }}
{{- include "chart.strategy" . | nindent 2 }}
selector:
matchLabels:
environment: "test"
release: "test"
progressDeadlineSeconds: 1200
template:
metadata:
labels:
environment: "test"
release: "test"
spec:
containers:
- name: "vllm"
image: "{{ required "Required value 'image.repository' must be defined !" .Values.image.repository }}:{{ required "Required value 'image.tag' must be defined !" .Values.image.tag }}"
{{- if .Values.image.command }}
command :
{{- with .Values.image.command }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
securityContext:
{{- if .Values.image.securityContext }}
{{- with .Values.image.securityContext }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- else }}
runAsNonRoot: false
{{- include "chart.user" . | indent 12 }}
{{- end }}
imagePullPolicy: IfNotPresent
{{- if .Values.image.env }}
env :
{{- with .Values.image.env }}
{{- toYaml . | nindent 10 }}
{{- end }}
{{- else }}
env: []
{{- end }}
{{- if or .Values.externalConfigs .Values.configs .Values.secrets }}
envFrom:
{{- if .Values.configs }}
- configMapRef:
name: "{{ .Release.Name }}-configs"
{{- end }}
{{- if .Values.secrets}}
- secretRef:
name: "{{ .Release.Name }}-secrets"
{{- end }}
{{- include "chart.externalConfigs" . | nindent 12 }}
{{- end }}
ports:
- name: {{ include "chart.container-port-name" . }}
containerPort: {{ include "chart.container-port" . }}
{{- include "chart.extraPorts" . | nindent 12 }}
{{- include "chart.probes" . | indent 10 }}
resources: {{- include "chart.resources" . | nindent 12 }}
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- with .Values.extraContainers }}
{{ toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.extraInit }}
initContainers:
- name: wait-download-model
image: {{ include "chart.extraInitImage" . }}
command:
- /bin/bash
args:
- -eucx
- while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done
env: {{- include "chart.extraInitEnv" . | nindent 10 }}
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- end }}
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: {{ .Release.Name }}-storage-claim
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if and (gt (int (index .Values.resources.requests "nvidia.com/gpu")) 0) (gt (int (index .Values.resources.limits "nvidia.com/gpu")) 0) }}
runtimeClassName: nvidia
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.product
operator: In
{{- with .Values.gpuModels }}
values:
{{- toYaml . | nindent 20 }}
{{- end }}
{{- end }}

View File

@ -0,0 +1,31 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: "{{ .Release.Name }}-hpa"
namespace: {{ .Release.Namespace }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: vllm
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}

View File

@ -0,0 +1,37 @@
{{- if .Values.extraInit }}
apiVersion: batch/v1
kind: Job
metadata:
name: "{{ .Release.Name }}-init-vllm"
namespace: {{ .Release.Namespace }}
spec:
ttlSecondsAfterFinished: 100
template:
metadata:
name: init-vllm
spec:
containers:
- name: job-download-model
image: {{ include "chart.extraInitImage" . }}
command:
- /bin/bash
args:
- -eucx
- aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data
env: {{- include "chart.extraInitEnv" . | nindent 8 }}
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data
resources:
requests:
cpu: 200m
memory: 1Gi
limits:
cpu: 500m
memory: 2Gi
restartPolicy: OnFailure
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-storage-claim"
{{- end }}

View File

@ -0,0 +1,7 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: "{{ .Release.Name }}-pdb"
namespace: {{ .Release.Namespace }}
spec:
maxUnavailable: {{ default 1 .Values.maxUnavailablePodDisruptionBudget }}

View File

@ -0,0 +1,13 @@
{{- if .Values.extraInit }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: "{{ .Release.Name }}-storage-claim"
namespace: {{ .Release.Namespace }}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.extraInit.pvcStorage }}
{{- end }}

View File

@ -0,0 +1,10 @@
apiVersion: v1
kind: Secret
metadata:
name: "{{ .Release.Name }}-secrets"
namespace: {{ .Release.Namespace }}
type: Opaque
data:
{{- range $key, $val := .Values.secrets }}
{{ $key }}: {{ $val | b64enc | quote }}
{{- end }}

View File

@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
name: "{{ .Release.Name }}-service"
namespace: {{ .Release.Namespace }}
spec:
type: ClusterIP
ports:
- name: {{ include "chart.service-port-name" . }}
port: {{ include "chart.service-port" . }}
targetPort: {{ include "chart.container-port-name" . }}
protocol: TCP
selector:
{{- include "chart.labels" . | nindent 4 }}

View File

@ -0,0 +1,265 @@
{
"$schema": "http://json-schema.org/schema#",
"type": "object",
"properties": {
"image": {
"type": "object",
"properties": {
"repository": {
"type": "string"
},
"tag": {
"type": "string"
},
"command": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"command",
"repository",
"tag"
]
},
"containerPort": {
"type": "integer"
},
"serviceName": {
"type": "null"
},
"servicePort": {
"type": "integer"
},
"extraPorts": {
"type": "array"
},
"replicaCount": {
"type": "integer"
},
"deploymentStrategy": {
"type": "object"
},
"resources": {
"type": "object",
"properties": {
"requests": {
"type": "object",
"properties": {
"cpu": {
"type": "integer"
},
"memory": {
"type": "string"
},
"nvidia.com/gpu": {
"type": "integer"
}
},
"required": [
"cpu",
"memory",
"nvidia.com/gpu"
]
},
"limits": {
"type": "object",
"properties": {
"cpu": {
"type": "integer"
},
"memory": {
"type": "string"
},
"nvidia.com/gpu": {
"type": "integer"
}
},
"required": [
"cpu",
"memory",
"nvidia.com/gpu"
]
}
},
"required": [
"limits",
"requests"
]
},
"gpuModels": {
"type": "array",
"items": {
"type": "string"
}
},
"autoscaling": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"minReplicas": {
"type": "integer"
},
"maxReplicas": {
"type": "integer"
},
"targetCPUUtilizationPercentage": {
"type": "integer"
}
},
"required": [
"enabled",
"maxReplicas",
"minReplicas",
"targetCPUUtilizationPercentage"
]
},
"configs": {
"type": "object"
},
"secrets": {
"type": "object"
},
"externalConfigs": {
"type": "array"
},
"customObjects": {
"type": "array"
},
"maxUnavailablePodDisruptionBudget": {
"type": "string"
},
"extraInit": {
"type": "object",
"properties": {
"s3modelpath": {
"type": "string"
},
"pvcStorage": {
"type": "string"
},
"awsEc2MetadataDisabled": {
"type": "boolean"
}
},
"required": [
"pvcStorage",
"s3modelpath",
"awsEc2MetadataDisabled"
]
},
"extraContainers": {
"type": "array"
},
"readinessProbe": {
"type": "object",
"properties": {
"initialDelaySeconds": {
"type": "integer"
},
"periodSeconds": {
"type": "integer"
},
"failureThreshold": {
"type": "integer"
},
"httpGet": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"port": {
"type": "integer"
}
},
"required": [
"path",
"port"
]
}
},
"required": [
"failureThreshold",
"httpGet",
"initialDelaySeconds",
"periodSeconds"
]
},
"livenessProbe": {
"type": "object",
"properties": {
"initialDelaySeconds": {
"type": "integer"
},
"failureThreshold": {
"type": "integer"
},
"periodSeconds": {
"type": "integer"
},
"httpGet": {
"type": "object",
"properties": {
"path": {
"type": "string"
},
"port": {
"type": "integer"
}
},
"required": [
"path",
"port"
]
}
},
"required": [
"failureThreshold",
"httpGet",
"initialDelaySeconds",
"periodSeconds"
]
},
"labels": {
"type": "object",
"properties": {
"environment": {
"type": "string"
},
"release": {
"type": "string"
}
},
"required": [
"environment",
"release"
]
}
},
"required": [
"autoscaling",
"configs",
"containerPort",
"customObjects",
"deploymentStrategy",
"externalConfigs",
"extraContainers",
"extraInit",
"extraPorts",
"gpuModels",
"image",
"labels",
"livenessProbe",
"maxUnavailablePodDisruptionBudget",
"readinessProbe",
"replicaCount",
"resources",
"secrets",
"servicePort"
]
}

View File

@ -0,0 +1,119 @@
# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.
# -- Image configuration
image:
# -- Image repository
repository: "vllm/vllm-openai"
# -- Image tag
tag: "latest"
# -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port
containerPort: 8000
# -- Service name
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []
# -- Number of replicas
replicaCount: 1
# -- Deployment strategy configuration
deploymentStrategy: {}
# -- Resource configuration
resources:
requests:
# -- Number of CPUs
cpu: 4
# -- CPU memory configuration
memory: 16Gi
# -- Number of gpus used
nvidia.com/gpu: 1
limits:
# -- Number of CPUs
cpu: 4
# -- CPU memory configuration
memory: 16Gi
# -- Number of gpus used
nvidia.com/gpu: 1
# -- Type of gpu used
gpuModels:
- "TYPE_GPU_USED"
# -- Autoscaling configuration
autoscaling:
# -- Enable autoscaling
enabled: false
# -- Minimum replicas
minReplicas: 1
# -- Maximum replicas
maxReplicas: 100
# -- Target CPU utilization for autoscaling
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# -- Configmap
configs: {}
# -- Secrets configuration
secrets: {}
# -- External configuration
externalConfigs: []
# -- Custom Objects configuration
customObjects: []
# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""
# -- Additional configuration for the init container
extraInit:
# -- Path of the model on the s3 which hosts model weights and config files
s3modelpath: "relative_s3_model_path/opt-125m"
# -- Storage size of the s3
pvcStorage: "1Gi"
awsEc2MetadataDisabled: true
# -- Additional containers configuration
extraContainers: []
# -- Readiness probe configuration
readinessProbe:
# -- Number of seconds after the container has started before readiness probe is initiated
initialDelaySeconds: 5
# -- How often (in seconds) to perform the readiness probe
periodSeconds: 5
# -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
failureThreshold: 3
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
# -- Name or number of the port to access on the container, on which the server is listening
port: 8000
# -- Liveness probe configuration
livenessProbe:
# -- Number of seconds after the container has started before liveness probe is initiated
initialDelaySeconds: 15
# -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
failureThreshold: 3
# -- How often (in seconds) to perform the liveness probe
periodSeconds: 10
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
# -- Name or number of the port to access on the container, on which the server is listening
port: 8000
labels:
environment: "test"
release: "test"