From 42bc386129f6890aa1654c31aa17a415f7642a5e Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sun, 24 Mar 2024 17:04:00 -0700
Subject: [PATCH] [CI/Build] respect the common environment variable MAX_JOBS
 (#3600)

---
 docs/source/getting_started/installation.rst |  9 +++++++++
 setup.py                                     | 19 +++++++++++++------
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index 77b0ae65..3355a894 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -60,6 +60,15 @@ You can also build and install vLLM from source:
     $ cd vllm
     $ pip install -e .  # This may take 5-10 minutes.
 
+.. tip::
+    To avoid overloading your system, you can limit the number of compilation jobs
+    that run simultaneously via the environment variable ``MAX_JOBS``. For example:
+
+    .. code-block:: console
+
+        $ export MAX_JOBS=6
+        $ pip install -e .
+
 .. tip::
     If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
 
diff --git a/setup.py b/setup.py
index 47cac599..27106b1f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 import io
 import os
 import re
+import logging
 import subprocess
 import sys
 from typing import List
@@ -13,6 +14,7 @@ import torch
 from torch.utils.cpp_extension import CUDA_HOME
 
 ROOT_DIR = os.path.dirname(__file__)
+logger = logging.getLogger(__name__)
 
 # vLLM only supports Linux platform
 assert sys.platform.startswith(
@@ -54,12 +56,17 @@ class cmake_build_ext(build_ext):
     # Determine number of compilation jobs and optionally nvcc compile threads.
     #
     def compute_num_jobs(self):
-        try:
-            # os.sched_getaffinity() isn't universally available, so fall back
-            # to os.cpu_count() if we get an error here.
-            num_jobs = len(os.sched_getaffinity(0))
-        except AttributeError:
-            num_jobs = os.cpu_count()
+        num_jobs = os.environ.get("MAX_JOBS", None)
+        if num_jobs is not None:
+            num_jobs = int(num_jobs)
+            logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
+        else:
+            try:
+                # os.sched_getaffinity() isn't universally available, so fall
+                # back to os.cpu_count() if we get an error here.
+                num_jobs = len(os.sched_getaffinity(0))
+            except AttributeError:
+                num_jobs = os.cpu_count()
 
         nvcc_threads = None
         if _is_cuda():