diff --git a/.gitignore b/.gitignore index b531b791..58572718 100644 --- a/.gitignore +++ b/.gitignore @@ -177,3 +177,7 @@ _build/ # vim swap files *.swo *.swp + +# hip files generated by PyTorch +*.hip +*_hip* diff --git a/Dockerfile.rocm b/Dockerfile.rocm new file mode 100644 index 00000000..7af53e44 --- /dev/null +++ b/Dockerfile.rocm @@ -0,0 +1,62 @@ +FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 + +# Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y + +# Install some basic utilities +RUN apt-get update && apt-get install -y \ + curl \ + ca-certificates \ + sudo \ + git \ + bzip2 \ + libx11-6 \ + build-essential \ + wget \ + unzip \ + nvidia-cuda-toolkit \ + tmux \ + && rm -rf /var/lib/apt/lists/* + +### Mount Point ### +# When launching the container, mount the code directory to /app +ARG APP_MOUNT=/app +VOLUME [ ${APP_MOUNT} ] +WORKDIR ${APP_MOUNT} + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas + +ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer +ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: +ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: + +# Install ROCm flash-attention +RUN mkdir libs \ + && cd libs \ + && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ + && cd flash-attention \ + && git checkout 3d2b6f5 \ + && git submodule update --init \ + && export GPU_ARCHS=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) \ + && patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch \ + && python3 setup.py install \ + && cd .. + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN pip install xformers==0.0.22.post7 --no-deps + +RUN cd /app \ + && cd vllm \ + && pip install -U -r requirements-rocm.txt \ + && bash patch_xformers-0.0.22.post7.rocm.sh \ + && python3 setup.py install \ + && cd .. + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --no-cache-dir ray[all] + +CMD ["/bin/bash"] diff --git a/README.md b/README.md index 9cc325e9..477e6148 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 +- [2023/12] Added ROCm support to vLLM. - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there. - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! @@ -43,6 +44,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server +- Support NVIDIA CUDA and AMD ROCm. 
vLLM seamlessly supports many Hugging Face models, including the following architectures: diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 89d1ba2d..1cca2c5f 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -1,6 +1,7 @@ #include #include +#include "cuda_compat.h" #include "dispatch_utils.h" namespace vllm { @@ -18,8 +19,8 @@ __global__ void silu_and_mul_kernel( const int d) { const int64_t token_idx = blockIdx.x; for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); - const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); + const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); out[token_idx * d + idx] = silu(x) * y; } } @@ -57,7 +58,7 @@ __global__ void activation_kernel( const int d) { const int64_t token_idx = blockIdx.x; for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = __ldg(&input[token_idx * d + idx]); + const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); out[token_idx * d + idx] = ACT_FN(x); } } diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 78e8d8ec..3676af1a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -15,6 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#ifdef USE_ROCM +#include +#endif + #include #include @@ -23,7 +27,11 @@ #include +#ifndef USE_ROCM #define WARP_SIZE 32 +#else +#define WARP_SIZE warpSize +#endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) @@ -40,7 +48,7 @@ inline __device__ float block_sum(float* red_smem, float sum) { // Compute the sum per warp. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += VLLM_SHFL_XOR_SYNC(sum, mask); } // Warp leaders store the data to shared memory. @@ -59,11 +67,11 @@ inline __device__ float block_sum(float* red_smem, float sum) { // Parallel reduction inside the warp. #pragma unroll for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += VLLM_SHFL_XOR_SYNC(sum, mask); } // Broadcast to other threads. - return __shfl_sync(uint32_t(-1), sum, 0); + return VLLM_SHFL_SYNC(sum, 0); } // TODO(woosuk): Merge the last two dimensions of the grid. @@ -223,7 +231,7 @@ __device__ void paged_attention_kernel( // The 0-th thread of each thread group already has its max qk value. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); } if (lane == 0) { red_smem[warp_idx] = qk_max; @@ -235,10 +243,10 @@ __device__ void paged_attention_kernel( qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; #pragma unroll for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); } // Broadcast the max qk value to all threads. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + qk_max = VLLM_SHFL_SYNC(qk_max, 0); // Get the sum of the exp values. 
float exp_sum = 0.f; @@ -326,7 +334,7 @@ __device__ void paged_attention_kernel( float acc = accs[i]; #pragma unroll for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { - acc += __shfl_xor_sync(uint32_t(-1), acc, mask); + acc += VLLM_SHFL_XOR_SYNC(acc, mask); } accs[i] = acc; } @@ -492,7 +500,7 @@ __global__ void paged_attention_v2_reduce_kernel( // Reduce within the warp. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask)); + max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); } if (lane == 0) { red_smem[warp_idx] = max_logit; @@ -502,10 +510,10 @@ __global__ void paged_attention_v2_reduce_kernel( max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; #pragma unroll for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask)); + max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); } // Broadcast the max value to all threads. - max_logit = __shfl_sync(uint32_t(-1), max_logit, 0); + max_logit = VLLM_SHFL_SYNC(max_logit, 0); // Load rescaled exp sums to shared memory. float* shared_exp_sums = reinterpret_cast(shared_mem + sizeof(float) * num_partitions); @@ -539,9 +547,9 @@ __global__ void paged_attention_v2_reduce_kernel( } // namespace vllm #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - cudaFuncSetAttribute( \ - vllm::paged_attention_v1_kernel, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size); \ + VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ + ((void*)vllm::paged_attention_v1_kernel), \ + shared_mem_size); \ vllm::paged_attention_v1_kernel \ <<>>( \ out_ptr, \ diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh index bb7df25b..ff64c4bd 100644 --- a/csrc/attention/attention_utils.cuh +++ b/csrc/attention/attention_utils.cuh @@ -17,6 +17,7 @@ */ #pragma once +#include "../cuda_compat.h" #include "attention_dtypes.h" #include @@ -39,7 +40,7 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { float qk = sum(qk_vec); #pragma unroll for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + qk += VLLM_SHFL_XOR_SYNC(qk, mask); } return qk; } diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh index 5786f77f..31e0cee0 100644 --- a/csrc/attention/dtype_bfloat16.cuh +++ b/csrc/attention/dtype_bfloat16.cuh @@ -21,8 +21,17 @@ #include "attention_generic.cuh" #include "dtype_float32.cuh" -#include -#include +#ifndef USE_ROCM + #include + #include +#else + #include + #include + + typedef __hip_bfloat162 __nv_bfloat162; + typedef __hip_bfloat16 __nv_bfloat16; +#endif + #include namespace vllm { @@ -98,7 +107,11 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 assert(false); #else - return a + b; + #ifndef USE_ROCM + return a + b; + #else + return __hadd(a, b); + #endif #endif } diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh index e6792112..d3271e69 100644 --- a/csrc/attention/dtype_float16.cuh +++ b/csrc/attention/dtype_float16.cuh @@ -21,6 +21,10 @@ #include "attention_generic.cuh" #include "dtype_float32.cuh" +#ifdef USE_ROCM + #include +#endif + #include namespace vllm { @@ -63,21 +67,47 @@ struct FloatVec { // Utility functions for type conversions. 
inline __device__ uint32_t h0_h0(uint16_t a) { +#ifndef USE_ROCM uint32_t b; asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); return b; +#else + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u16[0] = a; + tmp.u16[1] = a; + return tmp.u32; +#endif } inline __device__ float half_to_float(uint16_t h) { float f; +#ifndef USE_ROCM asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); +#else + asm volatile("v_cvt_f32_f16 %0, %1;" : "=v"(f) : "v"(h)); +#endif return f; } inline __device__ float2 half2_to_float2(uint32_t v) { +#ifndef USE_ROCM uint16_t lo, hi; asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); return make_float2(half_to_float(lo), half_to_float(hi)); +#else + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u32 = v; + float2 ret; + ret.x = half_to_float(tmp.u16[0]); + ret.y = half_to_float(tmp.u16[1]); + return ret; +#endif } inline __device__ uint16_t float_to_half(float f) { @@ -85,7 +115,11 @@ inline __device__ uint16_t float_to_half(float f) { uint32_t u32; uint16_t u16[2]; } tmp; +#ifndef USE_ROCM asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f)); +#else + asm volatile("v_cvt_f16_f32 %0, %1;\n" : "=v"(tmp.u32) : "v"(f)); +#endif return tmp.u16[0]; } @@ -94,12 +128,16 @@ inline __device__ uint32_t float2_to_half2(float2 f) { uint32_t u32; uint16_t u16[2]; } tmp; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); +#ifndef USE_ROCM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); + #else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); + #endif #else - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); + tmp.u16[0] = float_to_half(f.x); + tmp.u16[1] = float_to_half(f.y); #endif return tmp.u32; } @@ -107,13 +145,21 @@ inline __device__ uint32_t float2_to_half2(float2 f) { // Vector addition. inline __device__ uint16_t add(uint16_t a, uint16_t b) { uint16_t c; +#ifndef USE_ROCM asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); +#else + asm volatile("v_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif return c; } inline __device__ uint32_t add(uint32_t a, uint32_t b) { uint32_t c; +#ifndef USE_ROCM asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); +#else + asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif return c; } @@ -158,14 +204,22 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) { template<> inline __device__ uint16_t mul(uint16_t a, uint16_t b) { uint16_t c; +#ifndef USE_ROCM asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); +#else + asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif return c; } template<> inline __device__ uint32_t mul(uint32_t a, uint32_t b) { uint32_t c; +#ifndef USE_ROCM asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); +#else + asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); +#endif return c; } @@ -272,7 +326,11 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) { // Vector fused multiply-add. 
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { uint32_t d; +#ifndef USE_ROCM asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); +#else + asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c)); +#endif return d; } diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 3ad52b16..59bacffd 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -1,6 +1,7 @@ #include #include +#include "cuda_compat.h" #include "dispatch_utils.h" #include @@ -28,8 +29,8 @@ void swap_blocks( TORCH_CHECK(false, "Invalid device combination"); } - void *src_ptr = src.data_ptr(); - void *dst_ptr = dst.data_ptr(); + char *src_ptr = static_cast(src.data_ptr()); + char *dst_ptr = static_cast(dst.data_ptr()); const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -267,8 +268,8 @@ __global__ void gather_cached_kv_kernel( + head_offset * block_size + block_offset; - key[tgt_key_idx] = __ldg(&key_cache[src_key_idx]); - value[tgt_value_idx] = __ldg(&value_cache[src_value_idx]); + key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); + value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); } } @@ -333,8 +334,8 @@ __global__ void gather_cached_kv_kernel_optimized( src_key_indices[j] = src_key_idx; src_value_indices[j] = src_value_idx; - keys_to_store[j] = __ldg(&key_cache[src_key_idx]); - values_to_store[j] = __ldg(&value_cache[src_value_idx]); + keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); + values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); } #pragma unroll diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h new file mode 100644 index 00000000..aa58dd73 --- /dev/null +++ b/csrc/cuda_compat.h @@ -0,0 +1,28 @@ +#pragma once + +#ifndef USE_ROCM + #define VLLM_LDG(arg) __ldg(arg) +#else + #define VLLM_LDG(arg) *(arg) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) +#else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) +#else + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) +#endif + +#ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) +#else + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#endif + diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu index f1c30fe7..6c844a7f 100644 --- a/csrc/cuda_utils_kernels.cu +++ b/csrc/cuda_utils_kernels.cu @@ -1,3 +1,6 @@ +#ifdef USE_ROCM + #include +#endif int get_device_attribute( int attribute, int device_id) diff --git a/csrc/ops.h b/csrc/ops.h index cfb18fbe..e12c34f0 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -61,12 +61,14 @@ void gelu_fast( torch::Tensor& out, torch::Tensor& input); +#ifndef USE_ROCM torch::Tensor awq_gemm( torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters); +#endif void squeezellm_gemm( torch::Tensor vec, diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 0a5ec95f..e1dc7117 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -1,6 +1,7 @@ #include #include +#include "cuda_compat.h" 
#include "dispatch_utils.h" namespace vllm { @@ -19,14 +20,14 @@ inline __device__ void apply_rotary_embedding( // GPT-NeoX style rotary embedding. x_index = rot_offset; y_index = embed_dim + rot_offset; - cos = __ldg(cos_ptr + x_index); - sin = __ldg(sin_ptr + x_index); + cos = VLLM_LDG(cos_ptr + x_index); + sin = VLLM_LDG(sin_ptr + x_index); } else { // GPT-J style rotary embedding. x_index = 2 * rot_offset; y_index = 2 * rot_offset + 1; - cos = __ldg(cos_ptr + x_index / 2); - sin = __ldg(sin_ptr + x_index / 2); + cos = VLLM_LDG(cos_ptr + x_index / 2); + sin = VLLM_LDG(sin_ptr + x_index / 2); } const scalar_t x = arr[x_index]; diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 9e314296..b0120e96 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -48,8 +48,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); +#ifndef USE_ROCM // Quantization ops ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); +#endif + + ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); // Cache ops diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu index 1392b877..2c37d01e 100644 --- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu +++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu @@ -20,9 +20,17 @@ __device__ inline unsigned int as_unsigned(int i) { // 4-bit matvec kernel (LUT-based) __global__ void NUQ4MatMulKernel( +#ifndef USE_ROCM const half2* __restrict__ vec, +#else + const __half2* __restrict__ vec, +#endif const int* __restrict__ mat, +#ifndef USE_ROCM half2* __restrict__ mul, +#else + float2* __restrict__ mul, +#endif const __half* __restrict__ lookup_table, int height, int width, @@ -35,7 +43,11 @@ __global__ void NUQ4MatMulKernel( int row = BLOCKHEIGHT4 * blockIdx.x; int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; +#ifndef USE_ROCM __shared__ half2 blockvec[blockwidth2]; +#else + __shared__ __half2 blockvec[blockwidth2]; +#endif __shared__ __half deq2[16][BLOCKWIDTH]; int off = threadIdx.x; @@ -46,8 +58,13 @@ __global__ void NUQ4MatMulKernel( } __half res; +#ifndef USE_ROCM half2 res2; half2 tmp2; +#else + __half2 res2; + __half2 tmp2; +#endif int i; int k; @@ -68,48 +85,96 @@ __global__ void NUQ4MatMulKernel( while (k < blockwidth2) { tmp1 = as_unsigned(mat[i]); +#ifndef USE_ROCM res2 = {}; tmp2 = {}; +#else + res2.x = __half_as_ushort(__float2half(0)); + res2.y = __half_as_ushort(__float2half(0)); + tmp2.x = __half_as_ushort(__float2half(0)); + tmp2.y = __half_as_ushort(__float2half(0)); +#endif lut_index1 = tmp1 & 0xF; lut_index2 = (tmp1 >> 4) & 0xF; +#ifndef USE_ROCM tmp2.x = deq2[lut_index1][off]; tmp2.y = deq2[lut_index2][off]; +#else + tmp2.x = __half_as_ushort(deq2[lut_index1][off]); + tmp2.y = __half_as_ushort(deq2[lut_index2][off]); +#endif res2 = __hfma2(tmp2, blockvec[k + 0], res2); lut_index1 = (tmp1 >> 8) & 0xF; lut_index2 = (tmp1 >> 12) & 0xF; +#ifndef USE_ROCM tmp2.x = deq2[lut_index1][off]; tmp2.y = deq2[lut_index2][off]; +#else + tmp2.x = __half_as_ushort(deq2[lut_index1][off]); + tmp2.y = __half_as_ushort(deq2[lut_index2][off]); +#endif res2 = __hfma2(tmp2, blockvec[k + 1], res2); lut_index1 = (tmp1 >> 16) & 0xF; lut_index2 = (tmp1 >> 20) & 0xF; +#ifndef USE_ROCM tmp2.x = deq2[lut_index1][off]; tmp2.y = deq2[lut_index2][off]; +#else + tmp2.x = __half_as_ushort(deq2[lut_index1][off]); + tmp2.y = __half_as_ushort(deq2[lut_index2][off]); +#endif res2 = __hfma2(tmp2, blockvec[k + 2], res2); 
lut_index1 = (tmp1 >> 24) & 0xF; lut_index2 = (tmp1 >> 28) & 0xF; +#ifndef USE_ROCM tmp2.x = deq2[lut_index1][off]; tmp2.y = deq2[lut_index2][off]; +#else + tmp2.x = __half_as_ushort(deq2[lut_index1][off]); + tmp2.y = __half_as_ushort(deq2[lut_index2][off]); +#endif res2 = __hfma2(tmp2, blockvec[k + 3], res2); +#ifndef USE_ROCM res = __hadd(__hadd(res2.x, res2.y), res); +#else + res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)), res); +#endif i += width; k += 4; } // col%2 -> only set one of the two values +#ifndef USE_ROCM half2 res3 = {}; if (col % 2 == 0) { res3.x = res; } else { res3.y = res; } +#else + __half2 res3; + res3.x = __half_as_ushort(__float2half(0)); + res3.y = __half_as_ushort(__float2half(0)); + if (col % 2 == 0) { + res3.x = __half_as_ushort(res); + } else { + res3.y = __half_as_ushort(res); + } +#endif +#ifndef USE_ROCM atomicAdd(&mul[b * width / 2 + col / 2], res3); +#else + int tmp_addr = b * width / 2 + col / 2; + atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x))); + atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y))); +#endif } } @@ -136,10 +201,19 @@ void squeezellm_gemm( dim3 threads(BLOCKWIDTH); vllm::squeezellm::NUQ4MatMulKernel<<>>( +#ifndef USE_ROCM (half2*) vec.data(), +#else + (__half2*) vec.data_ptr(), +#endif mat.data_ptr(), +#ifndef USE_ROCM (half2*) mul.data(), (__half*) lookup_table.data(), +#else + (float2*) mul.data_ptr(), + (__half*) lookup_table.data_ptr(), +#endif height, width, batch, vec_height ); } diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index bc35aa04..b95ccef1 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -17,13 +17,15 @@ */ #pragma once +#include "cuda_compat.h" + namespace vllm { template __inline__ __device__ T warpReduceSum(T val) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) - val += __shfl_xor_sync(0xffffffff, val, mask, 32); + val += VLLM_SHFL_XOR_SYNC(val, mask); return val; } diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst new file mode 100644 index 00000000..512aa051 --- /dev/null +++ b/docs/source/getting_started/amd-installation.rst @@ -0,0 +1,143 @@ +.. _installation_rocm: + +Installation with ROCm +====================== + +vLLM 0.2.x onwards supports model inferencing and serving on AMD GPUs with ROCm. +At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported. +Data types currently supported in ROCm are FP16 and BF16. + +Requirements +------------ + +* OS: Linux +* Python: 3.8 -- 3.11 (Verified on 3.10) +* GPU: MI200s +* Pytorch 2.0.1/2.1.1/2.2 +* ROCm 5.7 + +Installation options: + +#. :ref:`(Recommended) Quick start with vLLM pre-installed in Docker Image ` +#. :ref:`Build from source ` +#. :ref:`Build from source with docker ` + +.. _quick_start_docker_rocm: + +(Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image +--------------------------------------------------------------------------- + +.. code-block:: console + + $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.3 + $ docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v :/app/model \ + embeddedllminfo/vllm-rocm \ + bash + + +.. _build_from_source_rocm: + +Option 2: Build from source +--------------------------- + +You can build and install vLLM from source: + +0. 
Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- `ROCm `_ +- `PyTorch `_ + + .. code-block:: console + + $ pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version + + +1. Install `flash attention for ROCm `_ + + Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention `_ + +.. note:: + - If you are using ROCm 5.7 with PyTorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly. + - If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`. + - ROCm's Flash-attention-2 (v2.0.4) does not support sliding window attention. + - You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + +2. Set up `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention + + .. code-block:: console + + $ pip install xformers==0.0.22.post7 --no-deps + $ bash patch_xformers-0.0.22.post7.rocm.sh + +3. Build vLLM. + + .. code-block:: console + + $ cd vllm + $ pip install -U -r requirements-rocm.txt + $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation + + +.. _build_from_source_docker_rocm: + +Option 3: Build from source with Docker +----------------------------------------------------- + +You can build and install vLLM from source: + +Build a docker image from `Dockerfile.rocm`, and launch a docker container. + +.. code-block:: console + + $ docker build -f Dockerfile.rocm -t vllm-rocm . + $ docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v :/app/model \ + vllm-rocm \ + bash + +Alternatively, if you plan to install vLLM-ROCm on a local machine or start from a fresh docker image (e.g. rocm/pytorch), you can follow the steps below: + +0. Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- `ROCm `_ +- `PyTorch `_ + +1. Install `flash attention for ROCm `_ + + Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention `_ + +.. note:: + - If you are using ROCm 5.7 with PyTorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly. + - If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`. + - ROCm's Flash-attention-2 (v2.0.4) does not support sliding window attention. + - You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + +2. Set up `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention + + .. code-block:: console + + $ pip install xformers==0.0.22.post7 --no-deps + $ bash patch_xformers-0.0.22.post7.rocm.sh + +3. Build vLLM. + + .. code-block:: console + + $ cd vllm + $ pip install -U -r requirements-rocm.txt + $ python setup.py install # This may take 5-10 minutes.
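Editorial note: the build-from-source paths in the guide above assume the ROCm PyTorch wheel, the ROCm flash-attention fork, and the patched xformers are all present before vLLM itself is compiled. As a sketch only (this helper is not part of the patch; the package names and the pinned xformers version are taken from the steps above), a pre-flight check could look like:

```python
# Hypothetical pre-flight check for the ROCm build steps described above.
# Not part of vLLM; it only probes packages that the guide installs.
import importlib

import torch


def check_rocm_prereqs() -> None:
    # A ROCm build of PyTorch reports a HIP version; a CUDA build reports None.
    if torch.version.hip is None:
        raise RuntimeError("PyTorch was not built with ROCm/HIP support; "
                           "install a +rocm wheel first (step 0 above).")
    print(f"PyTorch {torch.__version__} with HIP {torch.version.hip}")

    # flash_attn comes from the ROCm fork; xformers is pinned by the guide.
    for pkg, expected in (("flash_attn", None), ("xformers", "0.0.22.post7")):
        try:
            mod = importlib.import_module(pkg)
        except ImportError as exc:
            raise RuntimeError(f"{pkg} is missing; see the steps above.") from exc
        version = getattr(mod, "__version__", "unknown")
        if expected and version != expected:
            print(f"warning: {pkg} {version} found, the guide assumes {expected}")
        else:
            print(f"{pkg} {version} OK")


if __name__ == "__main__":
    check_rocm_prereqs()
```

Running such a check before `python setup.py install` surfaces a missing or mismatched dependency earlier than a failed kernel compile would.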
diff --git a/docs/source/index.rst b/docs/source/index.rst index 300c2276..04af0907 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -39,6 +39,7 @@ vLLM is flexible and easy to use with: * Tensor parallelism support for distributed inference * Streaming outputs * OpenAI-compatible API server +* Support NVIDIA CUDA and AMD ROCm. For more information, check out the following: @@ -56,6 +57,7 @@ Documentation :caption: Getting Started getting_started/installation + getting_started/amd-installation getting_started/quickstart .. toctree:: diff --git a/patch_xformers-0.0.22.post7.rocm.sh b/patch_xformers-0.0.22.post7.rocm.sh new file mode 100644 index 00000000..8dcb6240 --- /dev/null +++ b/patch_xformers-0.0.22.post7.rocm.sh @@ -0,0 +1,22 @@ +#!/bin/bash +export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)') +export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)') + +echo $XFORMERS_FMHA_FLASH_PATH +echo $XFORMERS_FMHA_COMMON_PATH + +if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then + echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}" + patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch" + echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}" +else + echo "${XFORMERS_FMHA_FLASH_PATH} was patched before" +fi + +if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then + echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}" + patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch" + echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}" +else + echo "${XFORMERS_FMHA_COMMON_PATH} was patched before" +fi diff --git a/requirements-rocm.txt b/requirements-rocm.txt new file mode 100644 index 00000000..42ab9d14 --- /dev/null +++ b/requirements-rocm.txt @@ -0,0 +1,17 @@ +ninja # For faster builds. +typing-extensions>=4.8.0 +starlette +psutil +ray >= 2.5.1 +pandas # Required for Ray data. +pyarrow # Required for Ray data. +sentencepiece # Required for LLaMA tokenizer. +numpy +tokenizers>=0.15.0 +huggingface_hub<0.18,>=0.16.4 +einops # Required for phi-1_5 +transformers >= 4.34.0 # Required for Mistral. +fastapi +uvicorn[standard] +pydantic == 1.10.13 # Required for OpenAI server. 
+aioprometheus[starlette] diff --git a/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch b/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch new file mode 100644 index 00000000..4d7495cf --- /dev/null +++ b/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch @@ -0,0 +1,13 @@ +--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000 ++++ common.py 2023-11-28 16:14:19.846233146 +0000 +@@ -298,8 +298,8 @@ + dtype = d.query.dtype + if device_type not in cls.SUPPORTED_DEVICES: + reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})") +- if device_type == "cuda" and not _built_with_cuda: +- reasons.append("xFormers wasn't build with CUDA support") ++ #if device_type == "cuda" and not _built_with_cuda: ++ # reasons.append("xFormers wasn't build with CUDA support") + if device_type == "cuda": + device_capability = torch.cuda.get_device_capability(d.device) + if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: diff --git a/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch b/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch new file mode 100644 index 00000000..4798f1ef --- /dev/null +++ b/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch @@ -0,0 +1,134 @@ +--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/flash.py 2023-11-29 03:17:03.930103539 +0000 ++++ flash.py 2023-11-28 16:14:25.206128903 +0000 +@@ -31,39 +31,39 @@ + + FLASH_VERSION = "0.0.0" + try: +- try: +- from ... import _C_flashattention # type: ignore[attr-defined] +- from ..._cpp_lib import _build_metadata +- +- if _build_metadata is not None: +- FLASH_VERSION = _build_metadata.flash_version +- except ImportError: +- import flash_attn +- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention +- +- FLASH_VERSION = flash_attn.__version__ +- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) +- if flash_ver_parsed < (2, 3): +- raise ImportError("Requires 2.3 for sliding window support") ++ #try: ++ # from ... import _C_flashattention # type: ignore[attr-defined] ++ # from ..._cpp_lib import _build_metadata ++ ++ # if _build_metadata is not None: ++ # FLASH_VERSION = _build_metadata.flash_version ++ #except ImportError: ++ import flash_attn ++ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention ++ ++ FLASH_VERSION = flash_attn.__version__ ++ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) ++ # if flash_ver_parsed < (2, 3): ++ # raise ImportError("Requires 2.3 for sliding window support") + + # create library so that flash-attn goes through the PyTorch Dispatcher +- _flash_lib = torch.library.Library("xformers_flash", "DEF") ++ #_flash_lib = torch.library.Library("xformers_flash", "DEF") + +- _flash_lib.define( +- "flash_fwd(Tensor query, Tensor key, Tensor value, " +- "Tensor? cu_seqlens_q, Tensor? 
cu_seqlens_k, " +- "int max_seqlen_q, int max_seqlen_k, " +- "float p, float softmax_scale, " +- "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" +- ) +- +- _flash_lib.define( +- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " +- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " +- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " +- "int max_seqlen_q, int max_seqlen_k, " +- "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" +- ) ++ #_flash_lib.define( ++ # "flash_fwd(Tensor query, Tensor key, Tensor value, " ++ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " ++ # "int max_seqlen_q, int max_seqlen_k, " ++ # "float p, float softmax_scale, " ++ # "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" ++ #) ++ ++ #_flash_lib.define( ++ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " ++ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " ++ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " ++ # "int max_seqlen_q, int max_seqlen_k, " ++ # "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" ++ #) + + def _flash_fwd( + query, +@@ -98,8 +98,8 @@ + p, + softmax_scale, + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + return_softmax, + None, # rng + ) +@@ -127,8 +127,8 @@ + softmax_scale, + False, + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + return_softmax, + None, + ) +@@ -169,8 +169,8 @@ + p, + softmax_scale, + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + None, + rng_state, + ) +@@ -193,15 +193,15 @@ + softmax_scale, + False, # zero_tensors + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + None, + rng_state, + ) + return dq, dk, dv + +- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") +- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") ++ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") ++ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") + except ImportError: + pass + +@@ -348,7 +348,7 @@ + implementation. + """ + +- OPERATOR = get_operator("xformers_flash", "flash_fwd") ++ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd") + SUPPORTED_DEVICES: Set[str] = {"cuda"} + CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0) + SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16} diff --git a/setup.py b/setup.py index 09e36a0c..2edf0b6a 100644 --- a/setup.py +++ b/setup.py @@ -8,27 +8,83 @@ import warnings from packaging.version import parse, Version import setuptools import torch -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME +from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) MAIN_CUDA_VERSION = "12.1" # Supported NVIDIA GPU architectures. 
-SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} +NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} +# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) + + +def _is_hip() -> bool: + return torch.version.hip is not None + + +def _is_cuda() -> bool: + return torch.version.cuda is not None + # Compiler flags. CXX_FLAGS = ["-g", "-O2", "-std=c++17"] # TODO(woosuk): Should we use -O3? NVCC_FLAGS = ["-O2", "-std=c++17"] +if _is_hip(): + if ROCM_HOME is None: + raise RuntimeError( + "Cannot find ROCM_HOME. ROCm must be available to build the package." + ) + NVCC_FLAGS += ["-DUSE_ROCM"] + +if _is_cuda() and CUDA_HOME is None: + raise RuntimeError( + "Cannot find CUDA_HOME. CUDA must be available to build the package.") + ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -if CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") + +def get_amdgpu_offload_arch(): + command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" + try: + output = subprocess.check_output([command]) + return output.decode('utf-8').strip() + except subprocess.CalledProcessError as e: + error_message = f"Error: {e}" + raise RuntimeError(error_message) from e + except FileNotFoundError as e: + # If the command is not found, print an error message + error_message = f"The command {command} was not found." + raise RuntimeError(error_message) from e + + return None + + +def get_hipcc_rocm_version(): + # Run the hipcc --version command + result = subprocess.run(['hipcc', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True) + + # Check if the command was executed successfully + if result.returncode != 0: + print("Error running 'hipcc --version'") + return None + + # Extract the version using a regular expression + match = re.search(r'HIP version: (\S+)', result.stdout) + if match: + # Return the version string + return match.group(1) + else: + print("Could not find HIP version in the output") + return None def get_nvcc_cuda_version(cuda_dir: str) -> Version: @@ -61,20 +117,22 @@ def get_torch_arch_list() -> Set[str]: return set() # Filter out the invalid architectures and print a warning. - valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS}) + valid_archs = NVIDIA_SUPPORTED_ARCHS.union( + {s + "+PTX" + for s in NVIDIA_SUPPORTED_ARCHS}) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( - "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " + "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}.") + f"Supported CUDA/ROCM architectures are: {valid_archs}.") invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( - f"Unsupported CUDA architectures ({invalid_arch_list}) are " + f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA architectures are: " + f"({env_arch_list}). 
Supported CUDA/ROCM architectures are: " f"{valid_archs}.", stacklevel=2) return arch_list @@ -82,7 +140,7 @@ def get_torch_arch_list() -> Set[str]: # First, check the TORCH_CUDA_ARCH_LIST environment variable. compute_capabilities = get_torch_arch_list() -if not compute_capabilities: +if _is_cuda() and not compute_capabilities: # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available # GPUs on the current machine. device_count = torch.cuda.device_count() @@ -93,69 +151,84 @@ if not compute_capabilities: "GPUs with compute capability below 7.0 are not supported.") compute_capabilities.add(f"{major}.{minor}") -nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) -if not compute_capabilities: - # If no GPU is specified nor available, add all supported architectures - # based on the NVCC CUDA version. - compute_capabilities = SUPPORTED_ARCHS.copy() - if nvcc_cuda_version < Version("11.1"): - compute_capabilities.remove("8.6") - if nvcc_cuda_version < Version("11.8"): - compute_capabilities.remove("8.9") - compute_capabilities.remove("9.0") - -# Validate the NVCC CUDA version. -if nvcc_cuda_version < Version("11.0"): - raise RuntimeError("CUDA 11.0 or higher is required to build the package.") -if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") -if nvcc_cuda_version < Version("11.8"): - if any(cc.startswith("8.9") for cc in compute_capabilities): - # CUDA 11.8 is required to generate the code targeting compute capability 8.9. - # However, GPUs with compute capability 8.9 can also run the code generated by - # the previous versions of CUDA 11 and targeting compute capability 8.0. - # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 - # instead of 8.9. - warnings.warn( - "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) - compute_capabilities.add("8.0+PTX") - if any(cc.startswith("9.0") for cc in compute_capabilities): +if _is_cuda(): + nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) + if not compute_capabilities: + # If no GPU is specified nor available, add all supported architectures + # based on the NVCC CUDA version. + compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() + if nvcc_cuda_version < Version("11.1"): + compute_capabilities.remove("8.6") + if nvcc_cuda_version < Version("11.8"): + compute_capabilities.remove("8.9") + compute_capabilities.remove("9.0") + # Validate the NVCC CUDA version. + if nvcc_cuda_version < Version("11.0"): raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") + "CUDA 11.0 or higher is required to build the package.") + if (nvcc_cuda_version < Version("11.1") + and any(cc.startswith("8.6") for cc in compute_capabilities)): + raise RuntimeError( + "CUDA 11.1 or higher is required for compute capability 8.6.") + if nvcc_cuda_version < Version("11.8"): + if any(cc.startswith("8.9") for cc in compute_capabilities): + # CUDA 11.8 is required to generate the code targeting compute capability 8.9. + # However, GPUs with compute capability 8.9 can also run the code generated by + # the previous versions of CUDA 11 and targeting compute capability 8.0. + # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 + # instead of 8.9. 
+ warnings.warn( + "CUDA 11.8 or higher is required for compute capability 8.9. " + "Targeting compute capability 8.0 instead.", + stacklevel=2) + compute_capabilities = set(cc for cc in compute_capabilities + if not cc.startswith("8.9")) + compute_capabilities.add("8.0+PTX") + if any(cc.startswith("9.0") for cc in compute_capabilities): + raise RuntimeError( + "CUDA 11.8 or higher is required for compute capability 9.0.") -# Add target compute capabilities to NVCC flags. -for capability in compute_capabilities: - num = capability[0] + capability[2] - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] - if capability.endswith("+PTX"): - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"] + # Add target compute capabilities to NVCC flags. + for capability in compute_capabilities: + num = capability[0] + capability[2] + NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] + if capability.endswith("+PTX"): + NVCC_FLAGS += [ + "-gencode", f"arch=compute_{num},code=compute_{num}" + ] -# Use NVCC threads to parallelize the build. -if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_threads = min(os.cpu_count(), nvcc_threads) - NVCC_FLAGS += ["--threads", str(num_threads)] + # Use NVCC threads to parallelize the build. + if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_threads = min(os.cpu_count(), nvcc_threads) + NVCC_FLAGS += ["--threads", str(num_threads)] + +elif _is_hip(): + amd_arch = get_amdgpu_offload_arch() + if amd_arch not in ROCM_SUPPORTED_ARCHS: + raise RuntimeError( + f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" + f"amdgpu_arch_found: {amd_arch}") ext_modules = [] + +vllm_extension_sources = [ + "csrc/cache_kernels.cu", + "csrc/attention/attention_kernels.cu", + "csrc/pos_encoding_kernels.cu", + "csrc/activation_kernels.cu", + "csrc/layernorm_kernels.cu", + "csrc/quantization/squeezellm/quant_cuda_kernel.cu", + "csrc/cuda_utils_kernels.cu", + "csrc/pybind.cpp", +] + +if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension = CUDAExtension( name="vllm._C", - sources=[ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/awq/gemm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/pybind.cpp", - ], + sources=vllm_extension_sources, extra_compile_args={ "cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS, @@ -183,10 +256,19 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - cuda_version = str(nvcc_cuda_version) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" + + if _is_hip(): + # Get the HIP version + hipcc_version = get_hipcc_rocm_version() + if hipcc_version != MAIN_CUDA_VERSION: + rocm_version_str = hipcc_version.replace(".", "")[:3] + version += f"+rocm{rocm_version_str}" + else: + cuda_version = str(nvcc_cuda_version) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + version += f"+cu{cuda_version_str}" + return version @@ -201,8 +283,12 @@ def read_readme() -> str: def get_requirements() -> List[str]: """Get Python package dependencies from requirements.txt.""" - with open(get_path("requirements.txt")) 
as f: - requirements = f.read().strip().split("\n") + if _is_hip(): + with open(get_path("requirements-rocm.txt")) as f: + requirements = f.read().strip().split("\n") + else: + with open(get_path("requirements.txt")) as f: + requirements = f.read().strip().split("\n") return requirements diff --git a/vllm/config.py b/vllm/config.py index cd92d361..a2739e5f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -6,7 +6,7 @@ from transformers import PretrainedConfig from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory +from vllm.utils import get_cpu_memory, is_hip logger = init_logger(__name__) @@ -98,12 +98,27 @@ class ModelConfig: def _verify_load_format(self) -> None: load_format = self.load_format.lower() - if load_format not in [ - "auto", "pt", "safetensors", "npcache", "dummy" - ]: + supported_load_format = [ + "auto", "pt", "safetensors", "npcache", "dummy" + ] + rocm_not_supported_load_format = ["safetensors"] + if load_format not in supported_load_format: raise ValueError( f"Unknown load format: {self.load_format}. Must be one of " "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.") + if is_hip(): + if load_format in ["safetensors"]: + rocm_supported_load_format = [ + f for f in supported_load_format + if (f not in rocm_not_supported_load_format) + ] + raise ValueError( + f"load format \'{load_format}\' is not supported in ROCm. " + f"Supported load format are " + f"{rocm_supported_load_format}") + # Force ROCm to load from pt weights if nothing specific is set + if load_format == "auto": + load_format = "pt" self.load_format = load_format def _verify_tokenizer_mode(self) -> None: @@ -116,6 +131,7 @@ class ModelConfig: def _verify_quantization(self) -> None: supported_quantization = ["awq", "squeezellm"] + rocm_not_supported_quantization = ["awq"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -137,6 +153,11 @@ class ModelConfig: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}.") + if is_hip( + ) and self.quantization in rocm_not_supported_quantization: + raise ValueError( + f"{self.quantization} quantization is currently not supported " + f"in ROCm.") logger.warning(f"{self.quantization} quantization is not fully " "optimized yet. The speed can be slower than " "non-quantized models.") @@ -364,6 +385,8 @@ _STR_DTYPE_TO_TORCH_DTYPE = { "bfloat16": torch.bfloat16, } +_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"] + def _get_and_verify_dtype( config: PretrainedConfig, @@ -393,6 +416,14 @@ def _get_and_verify_dtype( else: raise ValueError(f"Unknown dtype: {dtype}") + if is_hip() and torch_dtype == torch.float32: + rocm_supported_dtypes = [ + k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() + if (k not in _ROCM_NOT_SUPPORTED_DTYPE) + ] + raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " + f"Supported dtypes are {rocm_supported_dtypes}") + # Verify the dtype. if torch_dtype != config_dtype: if torch_dtype == torch.float32: diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 04660926..e101940f 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -3,6 +3,7 @@ from typing import Optional, Tuple, TYPE_CHECKING from vllm.config import ParallelConfig from vllm.logger import init_logger +from vllm.utils import is_hip logger = init_logger(__name__) @@ -73,7 +74,12 @@ def initialize_cluster( "Ray is not installed. 
Please install Ray to use distributed " "serving.") # Connect to a ray cluster. - ray.init(address=ray_address, ignore_reinit_error=True) + if is_hip(): + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) + else: + ray.init(address=ray_address, ignore_reinit_error=True) if not parallel_config.worker_use_ray: # Initialize cluster locally. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index b84af362..3f4ecb5d 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -10,6 +10,7 @@ from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, from vllm._C import ops from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata +from vllm.utils import is_hip _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. @@ -160,6 +161,8 @@ class PagedAttention(nn.Module): attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, ) output = out.view_as(query) else: diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index aa6bd065..f2f9cac6 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -7,6 +7,7 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.utils import is_hip class SqueezeLLMConfig(QuantizationConfig): @@ -114,9 +115,14 @@ class SqueezeLLMLinearMethod(LinearMethodBase): lookup_table = weights["lookup_table"] out_shape = x.shape[:-1] + (qweight.shape[-1], ) reshaped_x = x.reshape(-1, x.shape[-1]) - # NOTE: The output tensor should be zero-initialized. - out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) - ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) + if is_hip(): + out_f = torch.zeros(out_shape, device="cuda", dtype=torch.float) + ops.squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table) + out = out_f.to(dtype=torch.float16) + else: + # NOTE: The output tensor should be zero-initialized. + out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) + ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) if bias is not None: out = out + bias diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index 14739d95..ced05b18 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -10,6 +10,10 @@ from vllm.config import ModelConfig from vllm.model_executor.models import * from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights) +from vllm.utils import is_hip +from vllm.logger import init_logger + +logger = init_logger(__name__) # TODO(woosuk): Lazy-load the model classes. 
_MODEL_REGISTRY = { @@ -39,6 +43,18 @@ _MODEL_REGISTRY = { "YiForCausalLM": YiForCausalLM, } +# Models to be disabled in ROCm +_ROCM_UNSUPPORTED_MODELS = [] +if is_hip(): + for rocm_model in _ROCM_UNSUPPORTED_MODELS: + del _MODEL_REGISTRY[rocm_model] + +# Models partially supported in ROCm +_ROCM_PARTIALLY_SUPPORTED_MODELS = { + "MistralForCausalLM": + "Sliding window attention is not supported in ROCm's flash attention", +} + @contextlib.contextmanager def _set_default_torch_dtype(dtype: torch.dtype): @@ -53,7 +69,15 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: architectures = getattr(config, "architectures", []) for arch in architectures: if arch in _MODEL_REGISTRY: + if is_hip() and arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + logger.warning( + f"{arch} is not fully supported in ROCm. Reason: " + f"{_ROCM_PARTIALLY_SUPPORTED_MODELS[arch]}") return _MODEL_REGISTRY[arch] + elif arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError( + f"Model architecture {arch} is not supported by ROCm for now. \n" + f"Supported architectures {list(_MODEL_REGISTRY.keys())}") raise ValueError( f"Model architectures {architectures} are not supported for now. " f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") diff --git a/vllm/utils.py b/vllm/utils.py index 47e51048..d5d8d4ef 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -27,10 +27,14 @@ class Counter: self.counter = 0 +def is_hip() -> bool: + return torch.version.hip is not None + + def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 + cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 max_shared_mem = cuda_utils.get_device_attribute( cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) return int(max_shared_mem)
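Editorial note: the Python-side changes in this diff all branch on the new `is_hip()` helper from `vllm/utils.py`: it selects the shared-memory device-attribute ID (97 on CUDA, 74 on ROCm), rejects AWQ quantization and safetensors loading in `vllm/config.py`, and switches SqueezeLLM to an fp32 accumulation buffer. A minimal, self-contained sketch of that dispatch pattern (illustrative scaffolding only, not code from the patch beyond the constants it cites) is:

```python
# Minimal sketch of the backend-dispatch pattern used throughout this diff.
# The constants below are the ones that appear in the patch; the surrounding
# function is illustrative, not vLLM code.
from typing import Optional

import torch


def is_hip() -> bool:
    # Same probe the patch adds to vllm/utils.py: a ROCm build of PyTorch
    # exposes torch.version.hip, a CUDA build leaves it as None.
    return torch.version.hip is not None


# The CUDA and HIP runtimes number this device attribute differently, so the
# ID is chosen per backend (97 on CUDA, 74 on ROCm in this patch).
MAX_SHARED_MEMORY_ATTR_ID = 74 if is_hip() else 97

# Feature gates mirrored from vllm/config.py in this diff.
ROCM_UNSUPPORTED_QUANTIZATION = ["awq"]
ROCM_UNSUPPORTED_LOAD_FORMATS = ["safetensors"]


def verify_backend_options(quantization: Optional[str], load_format: str) -> str:
    """Reject options the ROCm path cannot serve and pick a usable load format."""
    if is_hip():
        if quantization in ROCM_UNSUPPORTED_QUANTIZATION:
            raise ValueError(f"{quantization} quantization is not supported on ROCm.")
        if load_format in ROCM_UNSUPPORTED_LOAD_FORMATS:
            raise ValueError(f"load format '{load_format}' is not supported on ROCm.")
        if load_format == "auto":
            return "pt"  # the patch forces ROCm to load .pt weights by default
    return load_format
```

Keeping the probe in one helper means each new ROCm restriction, or its removal as more kernels are ported, stays a one-line change per call site.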