# vllm/cmake/external_projects/vllm_flash_attn.cmake

# vllm-flash-attn needs VLLM_GPU_ARCHES to hold the target architectures in
# CMake's architecture syntax (e.g. 75-real, 89-virtual). In the CUDA case the
# arches are cleared (gencodes are set on a per-file basis instead), so the
# list has to be filled in by hand here: every CUDA_ARCHS entry has its dots
# stripped and "-real" appended (for example 8.0 -> 80-real).
if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Work on a scratch copy so CUDA_ARCHS itself is left unmodified.
  set(_vllm_fa_arches ${CUDA_ARCHS})
  list(TRANSFORM _vllm_fa_arches REPLACE "\\." "")
  list(TRANSFORM _vllm_fa_arches APPEND "-real")
  list(APPEND VLLM_GPU_ARCHES ${_vllm_fa_arches})
  unset(_vllm_fa_arches)
endif()

#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because all functions belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.

# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that
# directory instead of being downloaded. This enables local development of
# vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# A non-empty environment variable takes precedence. An environment variable
# that is exported but empty is ignored, so it cannot discard a directory that
# was supplied by other means.
if(DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}
   AND NOT "$ENV{VLLM_FLASH_ATTN_SRC_DIR}" STREQUAL "")
  # Quoted so set() always receives exactly one value argument; an unquoted
  # empty expansion would turn this call into an unset of the variable.
  set(VLLM_FLASH_ATTN_SRC_DIR "$ENV{VLLM_FLASH_ATTN_SRC_DIR}")
endif()

# Pick the origin of the vllm-flash-attn sources: the local directory named by
# VLLM_FLASH_ATTN_SRC_DIR when that variable is set, otherwise a pinned commit
# of the vllm-project/flash-attention git repository.
if(VLLM_FLASH_ATTN_SRC_DIR)
  set(_vllm_fa_origin SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
else()
  set(_vllm_fa_origin
      GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
      GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa
      GIT_PROGRESS TRUE)
endif()

# Both origins are built in the same directory under the top-level build tree.
# Don't share the vllm-flash-attn build between build types
FetchContent_Declare(
  vllm-flash-attn
  ${_vllm_fa_origin}
  BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
unset(_vllm_fa_origin)


# Make the declared content available and report where its sources live
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Copy over the vllm-flash-attn Python files. The same rule is registered for
# both the FA2 and the FA3 install component so the files are present when only
# one of them is built; when both are built the copy is simply done twice.
foreach(_vllm_fa_component _vllm_fa2_C _vllm_fa3_C)
  install(
    DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
    DESTINATION vllm_flash_attn
    COMPONENT ${_vllm_fa_component}
    FILES_MATCHING PATTERN "*.py"
  )
endforeach()
[Kernel] FlashMLA integration (#13747) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> 2025-02-26 21:35:08 -05:00			`# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target`
			`# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the`
			`# arches in the CUDA case (and instead set the gencodes on a per file basis)`
			`# we need to manually set VLLM_GPU_ARCHES here.`
			`if(VLLM_GPU_LANG STREQUAL "CUDA")`
			`foreach(_ARCH ${CUDA_ARCHS})`
			`string(REPLACE "." "" _ARCH "${_ARCH}")`
			`list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")`
			`endforeach()`
			`endif()`

			`#`
			`# Build vLLM flash attention from source`
			`#`
			`# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.`
			`# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.`
			`# They should be identical but if they aren't, this is a massive footgun.`
			`#`
			`# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.`
			`# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).`
			`# If no component is specified, vllm-flash-attn is still installed.`

			`# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.`
			`# This is to enable local development of vllm-flash-attn within vLLM.`
			`# It can be set as an environment variable or passed as a cmake argument.`
			`# The environment variable takes precedence.`
			`if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})`
			`set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})`
			`endif()`

			`if(VLLM_FLASH_ATTN_SRC_DIR)`
			`FetchContent_Declare(`
			`vllm-flash-attn SOURCE_DIR`
			`${VLLM_FLASH_ATTN_SRC_DIR}`
			`BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn`
			`)`
			`else()`
			`FetchContent_Declare(`
			`vllm-flash-attn`
			`GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git`
[Attention] Update to lastest FA3 code (#13111) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2025-04-17 18:14:07 -04:00			`GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa`
[Kernel] FlashMLA integration (#13747) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> 2025-02-26 21:35:08 -05:00			`GIT_PROGRESS TRUE`
			`# Don't share the vllm-flash-attn build between build types`
			`BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn`
			`)`
			`endif()`


			`# Fetch the vllm-flash-attn library`
			`FetchContent_MakeAvailable(vllm-flash-attn)`
			`message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")`

			`# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in`
			`# case only one is built, in the case both are built redundant work is done)`
			`install(`
			`DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/`
			`DESTINATION vllm_flash_attn`
			`COMPONENT _vllm_fa2_C`
			`FILES_MATCHING PATTERN "*.py"`
			`)`

			`install(`
			`DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/`
			`DESTINATION vllm_flash_attn`
			`COMPONENT _vllm_fa3_C`
			`FILES_MATCHING PATTERN "*.py"`
[Hardware] Update the flash attn tag to support Blackwell (#14244) 2025-03-05 22:01:37 -08:00			`)`