# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.

# -- Image configuration
image:
  # -- Image repository
  repository: "vllm/vllm-openai"
  # NOTE(review): "latest" is a floating tag; consider pinning a specific
  # version for reproducible deployments.
  # -- Image tag
  tag: "latest"
  # Every argument is kept as a quoted string (including the port) so that
  # YAML does not retype numeric-looking values.
  # -- Container launch command
  command:
    - "vllm"
    - "serve"
    - "/data/"
    - "--served-model-name"
    - "opt-125m"
    - "--dtype"
    - "bfloat16"
    - "--host"
    - "0.0.0.0"
    - "--port"
    - "8000"

# -- Container port
containerPort: 8000
# Explicit null: same value as the previous bare `serviceName:`, but
# unambiguous to readers and linters.
# -- Service name
serviceName: null
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []

# -- Number of replicas
replicaCount: 1

# -- Deployment strategy configuration
deploymentStrategy: {}

# -- Resource configuration
resources:
  requests:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of GPUs used
    nvidia.com/gpu: 1
  limits:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of GPUs used
    nvidia.com/gpu: 1

# "TYPE_GPU_USED" is a placeholder value; replace it before deploying.
# -- Type of GPU used
gpuModels:
  - "TYPE_GPU_USED"

# -- Autoscaling configuration
autoscaling:
  # -- Enable autoscaling
  enabled: false
  # -- Minimum replicas
  minReplicas: 1
  # -- Maximum replicas
  maxReplicas: 100
  # -- Target CPU utilization for autoscaling
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# -- Configmap
configs: {}

# -- Secrets configuration
secrets: {}

# -- External configuration
externalConfigs: []

# -- Custom Objects configuration
customObjects: []

# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""

# -- Additional configuration for the init container
extraInit:
  # -- Path of the model on the s3 which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
  # -- Storage size of the s3
  pvcStorage: "1Gi"
  # NOTE(review): presumably controls the AWS_EC2_METADATA_DISABLED setting of
  # the init container; confirm against the chart templates.
  awsEc2MetadataDisabled: true

# -- Additional containers configuration
extraContainers: []

# -- Readiness probe configuration
readinessProbe:
  # -- Number of seconds after the container has started before readiness probe is initiated
  initialDelaySeconds: 5
  # -- How often (in seconds) to perform the readiness probe
  periodSeconds: 5
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
  failureThreshold: 3
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

# -- Liveness probe configuration
livenessProbe:
  # -- Number of seconds after the container has started before liveness probe is initiated
  initialDelaySeconds: 15
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
  failureThreshold: 3
  # -- How often (in seconds) to perform the liveness probe
  periodSeconds: 10
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

# Labels; both values are set to "test" by default.
labels:
  environment: "test"
  release: "test"