vllm/examples/online_serving/chart-helm/values.yaml

# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.

# -- Image configuration
image:
  # -- Image repository
  repository: "vllm/vllm-openai"
  # -- Image tag
  tag: "latest"
  # -- Container launch command
  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]

# -- Container port
containerPort: 8000
# -- Service name
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []

# -- Number of replicas
replicaCount: 1

# -- Deployment strategy configuration
deploymentStrategy: {}

# -- Resource configuration
resources:
  requests:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1
  limits:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1

# -- Type of gpu used
gpuModels:
  - "TYPE_GPU_USED"

# -- Autoscaling configuration
autoscaling:
  # -- Enable autoscaling
  enabled: false
  # -- Minimum replicas
  minReplicas: 1
  # -- Maximum replicas
  maxReplicas: 100
  # -- Target CPU utilization for autoscaling
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# -- Configmap
configs: {}

# -- Secrets configuration
secrets: {}

# -- External configuration
externalConfigs: []

# -- Custom Objects configuration
customObjects: []

# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""

# -- Additional configuration for the init container
extraInit:
   # -- Path of the model on the s3 which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
   # -- Storage size of the s3
  pvcStorage: "1Gi"
  awsEc2MetadataDisabled: true

# -- Additional containers configuration
extraContainers: []

# -- Readiness probe configuration
readinessProbe:
  # -- Number of seconds after the container has started before readiness probe is initiated
  initialDelaySeconds: 5
  # -- How often (in seconds) to perform the readiness probe
  periodSeconds: 5
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
  failureThreshold: 3
   # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

# -- Liveness probe configuration
livenessProbe:
 # -- Number of seconds after the container has started before liveness probe is initiated
  initialDelaySeconds: 15
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
  failureThreshold: 3
  # -- How often (in seconds) to perform the liveness probe
  periodSeconds: 10
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

labels:
  environment: "test"
  release: "test"