Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
build_kernel/
build_kernel_*
build/
flash_attn.egg-info/
dist/
*.egg
*.egg-info
__pycache__/
*.so
*.dylib
*.pyd
*.log
97 changes: 69 additions & 28 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,27 @@ endif()

# User-facing build switches. Every option() declares a boolean cache entry
# with the default shown; the two set(... CACHE STRING ...) entries hold
# free-form tile lists. All of them can be overridden with -D<NAME>=<value>.
option(BUILD_WITH_CPP "Build cpp library" true)
option(BUILD_WITH_KERNEL "Build kernel" true)
option(FAST_BUILD "Fast build" OFF)
option(GEN_KERNEL "Generate kernel" OFF)
# Guards the build_flash_attn_host(mcFlashAttnHostStatic) call in the
# BUILD_WITH_KERNEL section of this file.
option(BUILD_WITH_HOST "Build host static library" true)
# When OFF, FLASHATTENTION_DISABLE_BACKWARD, FLASHATTENTION_DISABLE_DROPOUT
# and DROPOUT_FALSE are added as compile definitions further down.
option(BUILD_WITH_BWD_KERNEL "Build backward kernels" OFF)
# Forward feature variants. Each one that stays OFF adds the matching
# FLASHATTENTION_DISABLE_<FEATURE> and <FEATURE>_FALSE compile definitions
# further down.
option(FWD_ENABLE_LOCAL "Build forward local-attention kernel variants" OFF)
option(FWD_ENABLE_ALIBI "Build forward ALiBi kernel variants" OFF)
option(FWD_ENABLE_SOFTCAP "Build forward softcap kernel variants" OFF)
option(FWD_ENABLE_APPENDKV "Build forward append-KV kernel variants" OFF)
option(FWD_ENABLE_CAUSAL "Build forward causal kernel variants" OFF)
# Tile selections; "DEFAULT" is the documented placeholder for the default
# dispatch tiles.
# NOTE(review): the code that consumes these two lists is not visible in this
# section - confirm where and how they are parsed.
set(FWD_MN_LIST "DEFAULT" CACHE STRING "Comma-separated xcore1000 fwd tile list, or DEFAULT for the default dispatch tiles")
set(FWD_SPLIT_MN_LIST "DEFAULT" CACHE STRING "Comma-separated xcore1000 fwd_split tile list, or DEFAULT for the default dispatch tiles")

# Echo the effective build configuration to the configure log, one
# "<NAME>:<value>" status line per setting, in a fixed order.
foreach(fa_cfg_name IN ITEMS
    BUILD_WITH_CPP
    BUILD_WITH_KERNEL
    FAST_BUILD
    GEN_KERNEL
    BUILD_WITH_HOST
    BUILD_WITH_BWD_KERNEL
    FWD_MN_LIST
    FWD_SPLIT_MN_LIST
    FWD_ENABLE_LOCAL
    FWD_ENABLE_ALIBI
    FWD_ENABLE_SOFTCAP
    FWD_ENABLE_APPENDKV
    FWD_ENABLE_CAUSAL)
  # ${${name}} dereferences the variable whose name is held in fa_cfg_name.
  message(STATUS "${fa_cfg_name}:${${fa_cfg_name}}")
endforeach()

# set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_COMMAND} -E time")
# set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "../compile_time.sh")
Expand Down Expand Up @@ -99,22 +113,52 @@ add_compile_definitions(USE_MACA)
add_compile_definitions(NV_ARCH_A100)
add_compile_definitions(__FAST_HALF_CVT__)
add_compile_definitions(__MERGE_LDS_B64)

if(NOT DEFINED HDIM)
set(HDIM 0)
# Compile out code paths that were not requested at configure time.

# Backward kernels: when they are not built, dropout is disabled with them.
if(NOT BUILD_WITH_BWD_KERNEL)
  add_compile_definitions(
    FLASHATTENTION_DISABLE_BACKWARD
    FLASHATTENTION_DISABLE_DROPOUT
    DROPOUT_FALSE
  )
endif()

# Forward feature variants: every FWD_ENABLE_<FEATURE> switch that is off
# contributes FLASHATTENTION_DISABLE_<FEATURE> and <FEATURE>_FALSE.
foreach(fwd_feature IN ITEMS LOCAL ALIBI SOFTCAP APPENDKV CAUSAL)
  if(NOT FWD_ENABLE_${fwd_feature})
    add_compile_definitions(
      FLASHATTENTION_DISABLE_${fwd_feature}
      ${fwd_feature}_FALSE
    )
  endif()
endforeach()

# Fall back to "ALL" when FA_TYPE was not supplied (e.g. via -DFA_TYPE=...).
# The upper-cased value is later compared against "ALL" to decide whether
# FA_DTYPE_ALL is defined.
if(NOT DEFINED FA_TYPE)
set(FA_TYPE "ALL")
endif()
message(STATUS "HDIM=${HDIM},FA_TYPE=${FA_TYPE}")

STRING(TOUPPER "${FA_TYPE}" FA_TYPE_UPPER)

if(${HDIM} STREQUAL "0" AND NOT ${FA_TYPE_UPPER} STREQUAL "ALL")
MESSAGE(STATUS "Setting DTYPE to ALL because HDIM is 0")
set(FA_TYPE_UPPER "ALL")
# Resolve the head-dimension selection and report it in the configure log.
#
# HDIM=0 is an internal "no single head dimension" sentinel used for source
# selection. HDIM_STATUS keeps the user-facing log explicit so it does not
# look like hdim 0 is being built:
#   - explicit HDIM              -> "HDIM=<value>"
#   - HDIM_CONFIG_LIST supplied  -> "HDIM_CONFIG_LIST=<list>"
#   - neither                    -> "HDIM=ALL"
#
# An HDIM that is defined but empty (for example `-DHDIM=` coming from an
# unset Makefile variable) is treated like an undefined HDIM. Otherwise the
# later `HDIM_CONFIG=${HDIM}` / `HDIM_${HDIM}` definitions would expand to
# `HDIM_CONFIG=` and a bare `HDIM_`.
if(DEFINED HDIM AND NOT "${HDIM}" STREQUAL "")
  set(HDIM_STATUS "HDIM=${HDIM}")
else()
  set(HDIM 0)
  if(DEFINED HDIM_CONFIG_LIST AND NOT "${HDIM_CONFIG_LIST}" STREQUAL "")
    set(HDIM_STATUS "HDIM_CONFIG_LIST=${HDIM_CONFIG_LIST}")
  else()
    set(HDIM_STATUS "HDIM=ALL")
  endif()
endif()
message(STATUS "${HDIM_STATUS},FA_TYPE=${FA_TYPE}")

STRING(TOUPPER "${FA_TYPE}" FA_TYPE_UPPER)

if (${FA_TYPE_UPPER} STREQUAL "ALL")
add_compile_definitions(FA_DTYPE_ALL)
Expand All @@ -130,7 +174,16 @@ if(NOT HDIM STREQUAL "0")
add_compile_definitions(HDIM_CONFIG=${HDIM})
add_compile_definitions(HDIM_${HDIM})
else()
add_compile_definitions(HDIM_ALL)
if(DEFINED HDIM_CONFIG_LIST AND NOT "${HDIM_CONFIG_LIST}" STREQUAL "")
  # Multi-hdim build: expose the raw comma-separated list as HDIM_CONFIG and
  # add one HDIM_<n> definition per listed head dimension.
  add_compile_definitions("HDIM_CONFIG=${HDIM_CONFIG_LIST}")
  # Turn "128,256" into the CMake list "128;256" so it can be iterated.
  string(REPLACE "," ";" HDIM_CONFIG_ITEMS "${HDIM_CONFIG_LIST}")
  foreach(HDIM_CONFIG_ITEM IN LISTS HDIM_CONFIG_ITEMS)
    # Tolerate whitespace around items, e.g. "128, 256".
    string(STRIP "${HDIM_CONFIG_ITEM}" HDIM_CONFIG_ITEM)
    # Skip empty items (e.g. "128,,256" or a trailing comma); they would
    # otherwise produce a meaningless bare `HDIM_` definition.
    if(NOT "${HDIM_CONFIG_ITEM}" STREQUAL "")
      add_compile_definitions(HDIM_${HDIM_CONFIG_ITEM})
    endif()
  endforeach()
else()
  # No explicit list: build for every supported head dimension.
  add_compile_definitions(HDIM_ALL)
endif()
endif()

include_directories(
Expand Down Expand Up @@ -158,21 +211,9 @@ if (BUILD_WITH_KERNEL)

STRING(TOLOWER "${FA_TYPE}" FA_TYPE_LOWER)

if(FAST_BUILD AND GEN_KERNEL)
execute_process(
COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/run_generator.sh capi ${HDIM} ${FA_TYPE_LOWER} ${MACA_ARCH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE result
OUTPUT_VARIABLE output
ERROR_VARIABLE error
)

if(NOT result EQUAL 0)
message(ERROR ": Generate kernel files failed, output:\n${error}")
endif()
if(BUILD_WITH_HOST)
build_flash_attn_host(mcFlashAttnHostStatic)
endif()

build_flash_attn_host(mcFlashAttnHostStatic)
if(${MACA_ARCH} STREQUAL "xcore1000")
build_flash_attn_kernel(mcFlashAttnKernelXcore1000Static "xcore1000" ${HDIM} ${FA_TYPE})
target_compile_options(mcFlashAttnKernelXcore1000Static PRIVATE --offload-arch=xcore1000)
Expand Down
116 changes: 91 additions & 25 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,36 +1,96 @@
CURRENT_CPU_NUM:=$(shell grep -c processor /proc/cpuinfo)
# Core count taken from the first "cpu cores" line of /proc/cpuinfo. That line
# may be missing (the file layout differs between platforms), which would
# leave the variable empty, so fall back to `nproc` and finally to 1.
CURRENT_CPU_NUM:=$(shell n=$$(grep "cpu cores" /proc/cpuinfo 2>/dev/null | head -1 | awk '{print $$4}'); [ -n "$$n" ] || n=$$(nproc 2>/dev/null); echo "$${n:-1}")
# If in maca build, restrict num_jobs to 140 to resolve x86 docker which using 384 cores leads to OOM
ifdef BUILDROOT
# In x86 docker maca building, "make -j112" is a stable parallel number, so we use 140 here to get 140*0.8=112
CURRENT_CPU_NUM:=$(shell awk -v a=$(CURRENT_CPU_NUM) 'BEGIN {if(a>140) print 140; else print a}')
endif
MHA_NUM_JOBS:=$(shell awk -v n=$(CURRENT_CPU_NUM) 'BEGIN {print int(n * 0.8)}')
HDIM ?= 0
DTYPE ?= all
# FAST_BUILD is not currently supported.
FAST_BUILD ?= 0
GEN_KERNEL ?= 0
# Use 80% of the detected cores for parallel builds, but never fewer than one
# job: int(n*0.8) is 0 when n is 1 or empty, and `make -j0` is rejected.
MHA_NUM_JOBS:=$(shell awk -v n="$(CURRENT_CPU_NUM)" 'BEGIN {jobs=int(n*0.8); print (jobs>1?jobs:1)}')
Comment on lines +1 to +7

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using `grep "cpu cores" /proc/cpuinfo` to determine the CPU count is fragile and can fail or return empty in virtualized, containerized, or non-x86 environments. If it returns empty, `MHA_NUM_JOBS` evaluates to 0, which causes `make -j0` to fail with a syntax error.

Using `nproc` is much more robust and standard across Linux environments. We should also ensure `MHA_NUM_JOBS` is at least 1 to prevent invalid `-j0` build options.

```makefile
CURRENT_CPU_NUM:=$(shell nproc 2>/dev/null || echo 4)
# If in maca build, restrict num_jobs to 140 to resolve x86 docker which using 384 cores leads to OOM
ifdef BUILDROOT
# In x86 docker maca building, "make -j112" is a stable parallel number, so we use 140 here to get 140*0.8=112
	CURRENT_CPU_NUM:=$(shell awk -v a=$(CURRENT_CPU_NUM) 'BEGIN {if(a>140) print 140; else print a}')
endif
MHA_NUM_JOBS:=$(shell awk -v n=$(CURRENT_CPU_NUM) 'BEGIN {jobs=int(n*0.8); print (jobs>1?jobs:1)}')
```

# Space-separated head dimensions to build. The kernel target configures one
# build_kernel_<hd> directory per entry.
# HDIM_LIST ?= 128
HDIM_LIST ?= 128 256
# Helper variables for $(subst): `space` expands to exactly one space (make
# cannot spell a bare space literal) and `comma` to a literal comma.
empty :=
space := $(empty) $(empty)
comma := ,
# Comma-separated form of HDIM_LIST (e.g. "128,256"), passed to cmake as
# -DHDIM_CONFIG_LIST for the host library build.
HDIM_CONFIG_LIST := $(subst $(space),$(comma),$(strip $(HDIM_LIST)))
# Forwarded to cmake as -DFA_TYPE.
DTYPE ?= BF16
# Feature switches and tile lists, forwarded to cmake as -D<NAME>=<value>; the
# BUILD_WITH_BWD_KERNEL and FWD_ENABLE_* values are also exported to setup.py
# by the python target.
BUILD_WITH_BWD_KERNEL ?= FALSE
FWD_MN_LIST ?= DEFAULT
FWD_SPLIT_MN_LIST ?= DEFAULT
FWD_ENABLE_LOCAL ?= FALSE
FWD_ENABLE_ALIBI ?= FALSE
FWD_ENABLE_SOFTCAP ?= FALSE
FWD_ENABLE_APPENDKV ?= FALSE
FWD_ENABLE_CAUSAL ?= FALSE
# NOTE(review): the consumer of SUB_MODULE is not visible in this section.
SUB_MODULE ?= fused_dense_lib
# Helper scripts invoked by the run_build_projects_script_% and
# run_torch_extension_script_% pattern rules.
BUILD_PROJECTS_SCRIPT_PATH := ./tools/build_scripts/build_projects_related.sh
TORCH_EXTENSION_SCRIPT_PATH := ./tools/build_scripts/torch_extension_related.sh
run_build_projects_script_%:
@$(BUILD_PROJECTS_SCRIPT_PATH) $* || (echo "Execution failed with code $$?")
@chmod +x $(BUILD_PROJECTS_SCRIPT_PATH) && $(BUILD_PROJECTS_SCRIPT_PATH) $* || (echo "Execution failed with code $$?")
run_torch_extension_script_%:
@$(TORCH_EXTENSION_SCRIPT_PATH) $* || (echo "Execution failed with code $$?")
@chmod +x $(TORCH_EXTENSION_SCRIPT_PATH) && $(TORCH_EXTENSION_SCRIPT_PATH) $* || (echo "Execution failed with code $$?")

kernel:
mkdir -p build_kernel
cd build_kernel \
&& cmake \
-DMACA_PATH=${MACA_PATH} \
-DBUILD_WITH_KERNEL=TRUE \
-DBUILD_WITH_CPP=FALSE \
-DHDIM=${HDIM} \
-DFAST_BUILD=${FAST_BUILD} \
-DGEN_KERNEL=$(GEN_KERNEL) \
-DFA_TYPE=${DTYPE} \
.. \
&& make -j$(MHA_NUM_JOBS)
# Phase 1 - host library. Configure a scratch build_host directory with
# BUILD_WITH_HOST=TRUE and the full comma-separated HDIM_CONFIG_LIST, build
# only the mcFlashAttnHostStatic target, move the resulting archive into
# build_kernel/, then delete build_host. A failing cd/cmake/make aborts the
# recipe via `|| exit 1`.
# NOTE(review): the `mv` is followed by `;`, so its exit status is not
# checked; the command's final status is that of `cd .. && rm -rf build_host`.
@mkdir -p build_kernel build_host
@cd build_host && \
cmake \
-DMACA_PATH=${MACA_PATH} \
-DBUILD_WITH_KERNEL=TRUE \
-DBUILD_WITH_CPP=FALSE \
-DBUILD_WITH_HOST=TRUE \
-DBUILD_WITH_BWD_KERNEL=${BUILD_WITH_BWD_KERNEL} \
-DFWD_MN_LIST=${FWD_MN_LIST} \
-DFWD_SPLIT_MN_LIST=${FWD_SPLIT_MN_LIST} \
-DFWD_ENABLE_LOCAL=${FWD_ENABLE_LOCAL} \
-DFWD_ENABLE_ALIBI=${FWD_ENABLE_ALIBI} \
-DFWD_ENABLE_SOFTCAP=${FWD_ENABLE_SOFTCAP} \
-DFWD_ENABLE_APPENDKV=${FWD_ENABLE_APPENDKV} \
-DFWD_ENABLE_CAUSAL=${FWD_ENABLE_CAUSAL} \
-DHDIM_CONFIG_LIST=${HDIM_CONFIG_LIST} \
-DFA_TYPE=${DTYPE} \
.. && \
make -j$(MHA_NUM_JOBS) mcFlashAttnHostStatic || exit 1; \
mv libmcFlashAttnHostStatic.a ../build_kernel/; \
cd .. && rm -rf build_host
# Phase 2 - device kernels. For every head dimension in HDIM_LIST, configure
# a separate build_kernel_<hd> directory with -DHDIM=<hd> and
# BUILD_WITH_HOST=FALSE and run the default make target there. Any failure in
# the mkdir/cd/cmake/make chain stops the whole recipe.
@for hd in $(HDIM_LIST); do \
mkdir -p build_kernel_$$hd && \
cd build_kernel_$$hd && \
cmake \
-DMACA_PATH=${MACA_PATH} \
-DBUILD_WITH_KERNEL=TRUE \
-DBUILD_WITH_CPP=FALSE \
-DBUILD_WITH_HOST=FALSE \
-DHDIM=$$hd \
-DBUILD_WITH_BWD_KERNEL=${BUILD_WITH_BWD_KERNEL} \
-DFWD_MN_LIST=${FWD_MN_LIST} \
-DFWD_SPLIT_MN_LIST=${FWD_SPLIT_MN_LIST} \
-DFWD_ENABLE_LOCAL=${FWD_ENABLE_LOCAL} \
-DFWD_ENABLE_ALIBI=${FWD_ENABLE_ALIBI} \
-DFWD_ENABLE_SOFTCAP=${FWD_ENABLE_SOFTCAP} \
-DFWD_ENABLE_APPENDKV=${FWD_ENABLE_APPENDKV} \
-DFWD_ENABLE_CAUSAL=${FWD_ENABLE_CAUSAL} \
-DFA_TYPE=${DTYPE} \
.. && \
make -j$(MHA_NUM_JOBS) || exit 1; \
cd .. || exit 1; \
done
# Phase 3 - merge. For each architecture, unpack the per-hdim kernel archives
# found in ../build_kernel_<hd>/ into a temporary directory and repack all
# object files into a single build_kernel/libmcFlashAttnKernel<Arch>Static.a.
# Architectures for which no per-hdim archive exists are skipped.
#
# The unpack/repack steps run in a subshell so that a failure can never leave
# the loop inside tmp_<arch> (which would make the next architecture resolve
# its relative paths from the wrong directory and be reported as "not
# found"). Any failure now aborts the recipe instead of being ignored.
# NOTE(review): `ar x` extracts every archive into the same directory, so
# members with identical names in different hdim archives overwrite each
# other - confirm that object file names are unique per head dimension.
@cd build_kernel && \
for arch in Xcore1000 Xcore1500; do \
  if ! ls ../build_kernel_*/libmcFlashAttnKernel$${arch}Static.a 1> /dev/null 2>&1; then \
    echo "Skipping $$arch - not found"; \
    continue; \
  fi; \
  ( rm -rf tmp_$$arch && mkdir -p tmp_$$arch && cd tmp_$$arch && \
    for hd in $(HDIM_LIST); do \
      if [ -f ../../build_kernel_$$hd/libmcFlashAttnKernel$${arch}Static.a ]; then \
        ar x ../../build_kernel_$$hd/libmcFlashAttnKernel$${arch}Static.a && echo "Extracted from $$hd" || exit 1; \
      else \
        echo "Warning: build_kernel_$$hd/libmcFlashAttnKernel$${arch}Static.a not found"; \
      fi; \
    done && \
    ls -la *.o 2>/dev/null | head -5 && \
    rm -f ../libmcFlashAttnKernel$${arch}Static.a && \
    ar -r ../libmcFlashAttnKernel$${arch}Static.a *.o \
  ) || { echo "Error: failed to merge $$arch kernel archives" >&2; exit 1; }; \
  rm -rf tmp_$$arch; \
done

cplus_api: run_build_projects_script_sdk kernel
mkdir -p build_cpp
Expand All @@ -42,13 +102,19 @@ cplus_api: run_build_projects_script_sdk kernel
-DBUILD_WITH_CPP=TRUE \
-DCMAKE_INSTALL_PREFIX=./install \
-DHDIM=${HDIM} \
-DFAST_BUILD=${FAST_BUILD} \
-DGEN_KERNEL=$(GEN_KERNEL) \
-DBUILD_WITH_BWD_KERNEL=${BUILD_WITH_BWD_KERNEL} \
-DFWD_MN_LIST=${FWD_MN_LIST} \
-DFWD_SPLIT_MN_LIST=${FWD_SPLIT_MN_LIST} \
-DFWD_ENABLE_LOCAL=${FWD_ENABLE_LOCAL} \
-DFWD_ENABLE_ALIBI=${FWD_ENABLE_ALIBI} \
-DFWD_ENABLE_SOFTCAP=${FWD_ENABLE_SOFTCAP} \
-DFWD_ENABLE_APPENDKV=${FWD_ENABLE_APPENDKV} \
-DFWD_ENABLE_CAUSAL=${FWD_ENABLE_CAUSAL} \
-DFA_TYPE=${DTYPE} \
.. && make -j$(MHA_NUM_JOBS) && make install

python: run_build_projects_script_pytorch kernel
python ./setup.py bdist_wheel ; \
BUILD_WITH_BWD_KERNEL=${BUILD_WITH_BWD_KERNEL} FWD_ENABLE_LOCAL=${FWD_ENABLE_LOCAL} FWD_ENABLE_ALIBI=${FWD_ENABLE_ALIBI} FWD_ENABLE_SOFTCAP=${FWD_ENABLE_SOFTCAP} FWD_ENABLE_APPENDKV=${FWD_ENABLE_APPENDKV} FWD_ENABLE_CAUSAL=${FWD_ENABLE_CAUSAL} python ./setup.py bdist_wheel ; \

mla: run_build_projects_script_pytorch
mkdir -p dist
Expand All @@ -64,7 +130,7 @@ clean_mla:
rm -rf ./csrc/flash_mla/build

clean_kernel:
rm -rf ./build_kernel
rm -rf ./build_kernel*

clean_capi:
rm -rf ./build_cpp
Expand Down
83 changes: 77 additions & 6 deletions README_MX.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,86 @@ make cplus_api
make kernel
```

### Fast build (!Currently Unavailable)
Specify hdim and dtype, and compile only the specified combinations of bool switches based on the configuration in `tools/generator/bool_switch.ini`. Refer to the comments in the file for the configuration of `bool_switch.ini`.
### Fast build

Use `DEFAULT` to compile only the default dispatch MN tiles. Forward feature variants are disabled by default to reduce build time. Backward kernels are also disabled by default; because dropout forward is only useful together with backward in this build flow, dropout forward kernels are disabled when `BUILD_WITH_BWD_KERNEL=FALSE`.

Running `make python` with no extra options uses these defaults:

| Option | Default used by `make python` | Effect when changed |
| --- | --- | --- |
| `FLASHATTN_BUILD_PROJECTS` | unset | If unset, build both C500 and C600. Set `FLASHATTN_BUILD_PROJECTS=C500` or `C600` to build one architecture. |
| `HDIM_LIST` | `128 256` | Select the head dimensions to compile. |
| `DTYPE` | `BF16` | Select the dtype to compile. |
| `FWD_MN_LIST` | `DEFAULT` | Select xcore1000/xcore1500 fwd MN tiles; `DEFAULT` means the dispatch default tiles for each architecture. |
| `FWD_SPLIT_MN_LIST` | `DEFAULT` | Select xcore1000/xcore1500 fwd_split MN tiles; `DEFAULT` means the dispatch default tiles for each architecture. |
| `BUILD_WITH_BWD_KERNEL` | `FALSE` | Set to `TRUE` to build backward kernels and enable backward API support. |
| `FWD_ENABLE_LOCAL` | `FALSE` | Set to `TRUE` to build local/sliding-window forward variants. |
| `FWD_ENABLE_ALIBI` | `FALSE` | Set to `TRUE` to build ALiBi forward variants. |
| `FWD_ENABLE_SOFTCAP` | `FALSE` | Set to `TRUE` to build softcap forward variants. |
| `FWD_ENABLE_APPENDKV` | `FALSE` | Set to `TRUE` to build append-KV variants for `flash_attn_with_kvcache`. |
| `FWD_ENABLE_CAUSAL` | `FALSE` | Set to `TRUE` to build causal forward variants. |

With these defaults, `make python` builds BF16 forward-only kernels for hdim 128 and 256, uses only the default dispatch MN tiles, disables backward/dropout, and excludes local, ALiBi, softcap, append-KV, and causal forward variants. The default MN tiles are:

| arch | hdim | fwd default MN | fwd_split default MN |
| --- | --- | --- | --- |
| xcore1000 | 128 | 64x64 | 64x64 |
| xcore1000 | 256 | 64x32 | 64x64 |
| xcore1500 | 128 | 128x64 | 16x32, 128x64 |
| xcore1500 | 256 | 128x64 | 128x64 |

Enable only the variants needed by your workload:
```bash
# build only C500
FLASHATTN_BUILD_PROJECTS=C500 make python

# build backward and dropout-capable forward kernels
make python BUILD_WITH_BWD_KERNEL=TRUE

# support causal=True
make python FWD_ENABLE_CAUSAL=TRUE

# support local attention and ALiBi
make python FWD_ENABLE_LOCAL=TRUE FWD_ENABLE_ALIBI=TRUE

# support append KV in flash_attn_with_kvcache
make python FWD_ENABLE_APPENDKV=TRUE

# enable multiple variants together
make python BUILD_WITH_BWD_KERNEL=TRUE FWD_ENABLE_CAUSAL=TRUE FWD_ENABLE_LOCAL=TRUE
```
# fast build with generate kernel
make python HDIM=128 DTYPE=FP16 FAST_BUILD=1 GEN_KERNEL=1
# fast build without generate kernel
make python HDIM=128 DTYPE=FP16 FAST_BUILD=1 GEN_KERNEL=0

Override `FWD_MN_LIST` and `FWD_SPLIT_MN_LIST` to include more forward tiles:
```bash
make python FWD_MN_LIST=64x32,64x64 FWD_SPLIT_MN_LIST=64x64
```

The generated xcore1000 sources currently provide these MN choices:

| hdim | fwd MN choices | fwd_split MN choices | dispatch default |
| --- | --- | --- | --- |
| 32 | 128x64, 128x128 | 64x64 | fwd: 128x128; fwd_split: 64x64 |
| 64 | 16x16, 32x32, 64x64, 128x64, 128x128 | 16x16, 64x64 | fwd: 64x64; fwd_split: 64x64 |
| 96 | 64x64, 128x64 | 64x64 | fwd: 128x64; fwd_split: 64x64 |
| 128 | 64x32, 64x64, 128x32, 128x64 | 16x16, 32x32, 64x32, 64x64, 128x64 | fwd: 64x64; fwd_split: 64x64 |
| 160 | 64x32, 64x64, 128x64 | 64x64 | fwd: 64x32; fwd_split: 64x64 |
| 192 | 64x64; 128x64 for hdimv128 | 64x64 | fwd: 64x64 and 128x64 for hdimv128; fwd_split: 64x64 |
| 256 | 64x32, 64x64 | 64x32, 64x64 | fwd: 64x32 without dropout, 64x64 for dropout; fwd_split: 64x64 |
| 512 | 64x32 | 32x32 | fwd: 64x32; fwd_split: 32x32 |

The generated xcore1500 sources currently provide these MN choices:

| hdim | fwd MN choices | fwd_split MN choices | dispatch default |
| --- | --- | --- | --- |
| 32 | 128x64, 128x128 | 64x64 | fwd: 128x64 and 128x128; fwd_split: 64x64 |
| 64 | 128x64 | 128x64 | fwd: 128x64; fwd_split: 128x64 |
| 96 | 128x64 | 64x64 | fwd: 128x64; fwd_split: 64x64 |
| 128 | 128x64 | 16x32, 128x64 | fwd: 128x64; fwd_split: 16x32 for short seqlen_q, 128x64 otherwise |
| 160 | 128x64 | 64x64 | fwd: 128x64; fwd_split: 64x64 |
| 192 | 128x64 | 128x64 | fwd: 128x64; fwd_split: 128x64 |
| 256 | 128x64 | 128x64 | fwd: 128x64; fwd_split: 128x64 |

### Multi-SKU build

The build process can be controlled through the environment variable `FLASHATTN_BUILD_PROJECTS`:
Expand Down
Loading