# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc.). Since we clear the
# arches in the CUDA case (and instead set the gencodes on a per-file basis),
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})
    string(REPLACE "." "" _ARCH "${_ARCH}")
    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
  endforeach()
endif()

#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical, but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if(DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()

if(VLLM_FLASH_ATTN_SRC_DIR)
  FetchContent_Declare(
    vllm-flash-attn
    SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR}
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
  )
endif()

# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")

# Copy over the vllm-flash-attn Python files (duplicated for FA2 and FA3 in
# case only one is built; if both are built, redundant work is done).
install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm_flash_attn
  COMPONENT _vllm_fa2_C
  FILES_MATCHING PATTERN "*.py"
)

install(
  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
  DESTINATION vllm_flash_attn
  COMPONENT _vllm_fa3_C
  FILES_MATCHING PATTERN "*.py"
)
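
# Illustrative usage sketch (not part of the build logic). Assuming the standard
# CMake CLI and a hypothetical local checkout path, the local-development override
# described above could be exercised with something like:
#
#   cmake -S . -B build -DVLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention
#   # or, since the environment variable takes precedence:
#   VLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention cmake -S . -B build
#
#   cmake --build build
#   cmake --install build --component _vllm_fa2_C   # install only the FA2 bits
#
# The checkout path is a placeholder; VLLM_FLASH_ATTN_SRC_DIR and the
# _vllm_fa2_C / _vllm_fa3_C component names are the ones referenced above.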