diff --git a/backend/npu.py b/backend/npu.py index 610cefae..0c89c723 100644 --- a/backend/npu.py +++ b/backend/npu.py @@ -1430,7 +1430,7 @@ def _format_of(ty): name.append(kernelName); {'auto launch_call = [=]()' if enable_taskqueue else ''} {{ uint32_t blockNum = gridX * gridY * gridZ; - {'if (blockNum > (uint32_t)' + str(num_physical_blocks) + ') { std::cout << "WARNING: Grid " << blockNum << " > physical limit ' + str(num_physical_blocks) + ', performance maybe reduced." << std::endl;if (blockNum > 65535 && !' + str(enable_auto_map_parallel_blocks).lower() + ') {std::cout << "Grid " << blockNum << " > 65535, Please set TRITON_ALL_BLOCKS_PARALLEL=1 to enable all blocks parallel execution." << std::endl; } }'} + {'if (blockNum > (uint32_t)' + str(num_physical_blocks) + ') { /* std::cout << "WARNING: Grid " << blockNum << " > physical limit ' + str(num_physical_blocks) + ', performance maybe reduced." << std::endl; */ if (blockNum > 65535 && !' + str(enable_auto_map_parallel_blocks).lower() + ') {std::cout << "Grid " << blockNum << " > 65535, Please set TRITON_ALL_BLOCKS_PARALLEL=1 to enable all blocks parallel execution." << std::endl; } }'} {'blockNum = std::min(blockNum, (uint32_t)' + str(num_physical_blocks) + ');' if enable_auto_map_parallel_blocks else ''} {'cce::internal::DebugTunnelData *DTData = cce::internal::DebugTunnel::Open(blockNum);' if enable_device_print else ''} diff --git a/compile_shared.sh b/compile_shared.sh index d5fdd058..945b9d9e 100644 --- a/compile_shared.sh +++ b/compile_shared.sh @@ -117,3 +117,17 @@ else fi fi notify_apply_patch + +# ============================================================================ +# Compile bitcode libraries for dl.custom() custom ops +# ============================================================================ +if command -v ccec &> /dev/null; then + echo "Compiling bitcode libraries for custom ops..." + BITCODE_DIR="$home_path/dlcompiler/bitcode" + if [ -f "$BITCODE_DIR/compile_bc.sh" ]; then + bash "$BITCODE_DIR/compile_bc.sh" || echo "Warning: bitcode compilation failed (non-fatal)" + fi +else + echo "ccec not found, skipping bitcode compilation." + echo " Install CANN toolkit or ensure ccec is in PATH to enable." +fi diff --git a/dlcompiler/bitcode/compile_bc.sh b/dlcompiler/bitcode/compile_bc.sh new file mode 100755 index 00000000..aa01cc7a --- /dev/null +++ b/dlcompiler/bitcode/compile_bc.sh @@ -0,0 +1,171 @@ +# ============================================================================ +# Compile all src/*.cpp → bc/*.aiv.bc (bitcode library for dl.custom()) +# +# 编译器: ccec (Ascend CANN CCE compiler) +# 架构: dav-c220-vec (Ascend 910B2 vector core) +# +# 用法: +# bash compile_bc.sh 编译所有 +# bash compile_bc.sh add 只编译 add.cpp +# bash compile_bc.sh softmax 只编译 softmax_ops.cpp +# bash compile_bc.sh -f 强制重新编译全部 +# ============================================================================ + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SRC_DIR="$SCRIPT_DIR/src" +# Output to language/deeplink/bitcode/bc/ so .bc files are auto-installed with pip +BC_DIR="$SCRIPT_DIR/../../language/deeplink/bitcode/bc" + +# ============================================================================ +# 1. 检测 CANN 安装路径 +# ============================================================================ +detect_cann_path() { + local cann_path="" + + # 优先级 1: CANN_PATH 环境变量 + if [ -n "${CANN_PATH:-}" ] && [ -d "$CANN_PATH" ]; then + cann_path="$CANN_PATH" + # 优先级 2: ASCEND_HOME_PATH 下的 cann 子目录 + elif [ -n "${ASCEND_HOME_PATH:-}" ] && [ -d "$ASCEND_HOME_PATH/cann" ]; then + cann_path="$ASCEND_HOME_PATH/cann" + elif [ -n "${ASCEND_HOME_PATH:-}" ] && [ -d "$ASCEND_HOME_PATH/cann-9.0.0" ]; then + cann_path="$ASCEND_HOME_PATH/cann-9.0.0" + # 优先级 3: 自动检测 /usr/local/Ascend/cann-*/ + else + cann_path=$(ls -d /usr/local/Ascend/cann-*/ 2>/dev/null | head -1 || true) + if [ -n "$cann_path" ]; then + cann_path="${cann_path%/}" + fi + fi + + echo "$cann_path" +} + +CANN_HOME=$(detect_cann_path) +if [ -z "$CANN_HOME" ]; then + echo "ERROR: Cannot find CANN installation." + echo " Set CANN_PATH or ASCEND_HOME_PATH environment variable," + echo " or install CANN toolkit under /usr/local/Ascend/" + exit 1 +fi +echo "CANN_HOME: $CANN_HOME" + +# ============================================================================ +# 2. 检测 ccec 编译器 +# ============================================================================ +CCEC="${CANN_HOME}/bin/ccec" +if [ ! -x "$CCEC" ]; then + echo "ERROR: ccec not found at $CCEC" + exit 1 +fi +echo "CCEC: $CCEC" + +# ============================================================================ +# 3. 编译参数 +# ============================================================================ +# 架构: dav-c220-vec (Ascend 910B2), dav-c100-vec (Ascend 910B1) +AICORE_ARCH="${DLCOMPILER_AICORE_ARCH:-dav-c220-vec}" + +CCEC_FLAGS="-x cce --cce-aicore-arch=${AICORE_ARCH} --cce-aicore-only -c -emit-llvm --std=c++17" + +CCEC_INCLUDES="\ + -I ${CANN_HOME}/asc \ + -I ${CANN_HOME}/aarch64-linux/asc/include/basic_api \ + -I ${CANN_HOME}/aarch64-linux/asc/include/interface \ + -I ${CANN_HOME}/aarch64-linux/ascendc/include/highlevel_api \ + -I ${CANN_HOME}/aarch64-linux/ascendc/include/basic_api/impl \ + -I ${CANN_HOME}/aarch64-linux/ascendc/basic_api \ + -I ${CANN_HOME}/aarch64-linux/ascendc/basic_api/interface \ + -I ${CANN_HOME}/aarch64-linux/ascendc/highlevel_api/lib \ + -I ${CANN_HOME}/aarch64-linux/tiling" + +echo "AICORE_ARCH: $AICORE_ARCH" +echo "INCLUDES:" +echo "$CCEC_INCLUDES" | tr ' ' '\n' | sed 's/^/ /' + +# ============================================================================ +# 4. 确保 bc/ 目录存在 +# ============================================================================ +mkdir -p "$BC_DIR" + +# ============================================================================ +# 5. 编译函数 +# ============================================================================ +compile_one() { + local CPP="$1" + local base_name="$(basename "${CPP%.cpp}")" + local BC="$BC_DIR/${base_name}.aiv.bc" + + if [ -f "$BC" ] && [ "$FORCE" != "true" ]; then + echo "Bitcode file $BC already exists, skipping." + echo " To recompile: rm -f $BC && bash compile_bc.sh" + return + fi + + echo "Compiling $CPP → $BC ..." + ${CCEC} ${CCEC_FLAGS} ${CCEC_INCLUDES} "${CPP}" -o "${BC}" + + if [ $? -eq 0 ]; then + echo " OK: $BC" + # Show exported custom symbols + if command -v nm &> /dev/null; then + echo " Symbols:" + nm -C "${BC}" 2>/dev/null | grep -i custom | sed 's/^/ /' || \ + nm "${BC}" 2>/dev/null | grep -i custom | sed 's/^/ /' || true + fi + else + echo " FAILED: $CPP" + exit 1 + fi +} + +# ============================================================================ +# 6. 解析参数 → 确定编译目标 +# ============================================================================ +FORCE=false +TARGET="" + +for arg in "$@"; do + case "$arg" in + -f|--force) + FORCE=true + ;; + *) + TARGET="$arg" + ;; + esac +done + +cd "$SCRIPT_DIR" + +if [ -n "$TARGET" ]; then + case "$TARGET" in + add) + compile_one "$SRC_DIR/add.cpp" + ;; + softmax) + compile_one "$SRC_DIR/softmax_ops.cpp" + ;; + all) + for CPP in "$SRC_DIR"/*.cpp; do + [ -f "$CPP" ] && compile_one "$CPP" + done + ;; + *) + echo "Unknown target: $TARGET" + echo " Options: add, softmax, softmax_full, all" + exit 1 + ;; + esac +else + # 默认:编译所有 + for CPP in "$SRC_DIR"/*.cpp; do + [ -f "$CPP" ] && compile_one "$CPP" + done +fi + +echo "" +echo "Done. Bitcode files in: $BC_DIR" +ls -lh "$BC_DIR"/*.aiv.bc 2>/dev/null || echo " (no .aiv.bc files)" diff --git a/dlcompiler/bitcode/src/add.cpp b/dlcompiler/bitcode/src/add.cpp new file mode 100644 index 00000000..a4159e7c --- /dev/null +++ b/dlcompiler/bitcode/src/add.cpp @@ -0,0 +1,125 @@ +// ============================================================================ +// DSL Custom Op — add (multi-dtype: int32 / fp32 / fp16) +// +// 作用: 在一个 bitcode 中提供三种 dtype 的 vadd C 接口,供 MLIR 调用。 +// 编译: bash compile.sh → add.aiv.bc +// 符号: +// _mlir_ciface_custom_add_int32 (对应 dl custom_add_int32) +// _mlir_ciface_custom_add_fp32 (对应 dl custom_add_fp32) +// _mlir_ciface_custom_add_fp16 (对应 dl custom_add_fp16) +// +// 共享位码库: ops/skills/triton/triton-dsl-custom-op/bitcode_lib/ +// ============================================================================ + +#define __aiv__ [aicore] +#define INTRINSIC_NO_ARGS(NAME) NAME() +#define INTRINSIC(NAME, ...) NAME(__VA_ARGS__) + +// MLIR memref 结构:表示一块连续内存(指针 + offset + shape + strides) +template struct memref_t { + T *allocated; // 分配基址 + T *aligned; // 对齐后的有效起始地址 + int64_t offset; // 元素偏移量 + int64_t sizes[Dim]; // 各维度长度 + int64_t strides[Dim]; // 各维度步长 +}; + +// vadd 指令参数结构 +template +struct intrin_args { + __ubuf__ DST_T *dst; // 输出指针 + __ubuf__ SRC_T *src[OPERANUM]; // 输入指针数组 + SRC_T scalar; // 标量值(未使用) + uint64_t repeat; // 重复次数 + uint16_t dst_block_stride; // block 内步长 + uint16_t src_block_stride[OPERANUM]; // 输入 block 内步长 + uint16_t dst_repeat_stride; // repeat 间步长 + uint16_t src_repeat_stride[OPERANUM]; // 输入 repeat 间步长 +}; + +// vadd 模板函数:逐元素向量加法 +template +__aiv__ __attribute__((always_inline)) void +vector_eltwise_vadd_intrin(intrin_args<2, SRC_TYPE, DST_TYPE> args) { +#define ELTWISE_VV_ARGS \ + args.dst, args.src[0], args.src[1], args.repeat, args.dst_block_stride, \ + args.src_block_stride[0], args.src_block_stride[1], \ + args.dst_repeat_stride, args.src_repeat_stride[0], \ + args.src_repeat_stride[1] + + // vadd(dst, src0, src1, repeat, dst_bs, src0_bs, src1_bs, dst_rs, src0_rs, + // src1_rs) + INTRINSIC(vadd, ELTWISE_VV_ARGS); +} + +// vadd 调用包装:处理连续访问的公共逻辑 +template +__aiv__ __attribute__((always_inline)) void +vadd_impl(memref_t<__ubuf__ T, 1> *src0, memref_t<__ubuf__ T, 1> *src1, + memref_t<__ubuf__ T, 1> *dst) { + + uint16_t block_stride = 1; + uint16_t repeat_stride = 8; + + auto new_src0_ptr = src0->aligned + src0->offset; + auto new_src1_ptr = src1->aligned + src1->offset; + auto dst_ptr = dst->aligned + dst->offset; + + // 设置向量掩码(处理边界) + INTRINSIC_NO_ARGS(set_mask_count); + const int64_t n = dst->sizes[0]; + INTRINSIC(set_vector_mask, 0, n); + + // 调用 vadd 指令 + vector_eltwise_vadd_intrin( + intrin_args<2, T>{dst_ptr, + {new_src0_ptr, new_src1_ptr}, + 0, // scalar (unused) + 1, // repeat = 1(单次执行,全部元素由 mask 覆盖) + block_stride, + {block_stride, block_stride}, + repeat_stride, + {repeat_stride, repeat_stride}}); + + // 恢复掩码 + INTRINSIC_NO_ARGS(set_mask_norm); +} + +// ============================================================================ +// MLIR 可调用的 C 接口(三种 dtype) +// +// Python 侧 symbol 命名规则: +// str(tl.int32) → "int32" → symbol = "custom_add_int32" +// str(tl.float32) → "fp32" → symbol = "custom_add_fp32" +// str(tl.float16) → "fp16" → symbol = "custom_add_fp16" +// +// MLIR 自动添加 _mlir_ciface_ 前缀: +// Python "custom_add_int32" → C _mlir_ciface_custom_add_int32 +// Python "custom_add_fp32" → C _mlir_ciface_custom_add_fp32 +// Python "custom_add_fp16" → C _mlir_ciface_custom_add_fp16 +// ============================================================================ + +extern "C" { + +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_add_int32(memref_t<__ubuf__ int32_t, 1> *src0, + memref_t<__ubuf__ int32_t, 1> *src1, + memref_t<__ubuf__ int32_t, 1> *dst) { + vadd_impl(src0, src1, dst); +} + +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_add_fp32(memref_t<__ubuf__ float, 1> *src0, + memref_t<__ubuf__ float, 1> *src1, + memref_t<__ubuf__ float, 1> *dst) { + vadd_impl(src0, src1, dst); +} + +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_add_fp16(memref_t<__ubuf__ __fp16, 1> *src0, + memref_t<__ubuf__ __fp16, 1> *src1, + memref_t<__ubuf__ __fp16, 1> *dst) { + vadd_impl(src0, src1, dst); +} + +} // extern "C" diff --git a/dlcompiler/bitcode/src/softmax_ops.cpp b/dlcompiler/bitcode/src/softmax_ops.cpp new file mode 100644 index 00000000..248f47a4 --- /dev/null +++ b/dlcompiler/bitcode/src/softmax_ops.cpp @@ -0,0 +1,150 @@ +// ============================================================================ +// DSL Custom Op — softmax 相关运算 (fp32) +// +// 包含用于 FlashAttention 的 vector 操作: +// vexp: y = exp(x) 单目 +// vdiv: z = x / y 双目 +// vsub: z = x - y 双目 +// vmul: z = x * y 双目 +// +// 编译: bash compile.sh → softmax_ops.aiv.bc +// 符号: +// _mlir_ciface_custom_vexp_fp32 +// _mlir_ciface_custom_vdiv_fp32 +// _mlir_ciface_custom_vsub_fp32 +// _mlir_ciface_custom_vmul_fp32 +// +// 共享位码库: ops/skills/triton/triton-dsl-custom-op/bitcode_lib/ +// ============================================================================ + +#define __aiv__ [aicore] +#define INTRINSIC_NO_ARGS(NAME) NAME() +#define INTRINSIC(NAME, ...) NAME(__VA_ARGS__) + +template struct memref_t { + T *allocated; + T *aligned; + int64_t offset; + int64_t sizes[Dim]; + int64_t strides[Dim]; +}; + +// ============================================================================ +// 公共包装: 设置 mask → 调用 intrinsic → 恢复 mask +// 所有操作都是连续访问: block_stride=1, repeat_stride=8 +// ============================================================================ + +// 单目操作包装 (vexp) +template +__aiv__ __attribute__((always_inline)) void +unary_op_impl(void (*intrin)(__ubuf__ T *, __ubuf__ T *, uint64_t, uint16_t, + uint16_t, uint16_t, uint16_t), + memref_t<__ubuf__ T, 1> *src, memref_t<__ubuf__ T, 1> *dst) { + auto s = src->aligned + src->offset; + auto d = dst->aligned + dst->offset; + int64_t n = dst->sizes[0]; + + INTRINSIC_NO_ARGS(set_mask_count); + INTRINSIC(set_vector_mask, 0, n); + + // 单目: (dst, src, repeat, dst_bs, src_bs, dst_rs, src_rs) + intrin(d, s, 1, 1, 1, 8, 8); + + INTRINSIC_NO_ARGS(set_mask_norm); +} + +// 双目操作包装 (vdiv, vsub, vmul) +template +__aiv__ __attribute__((always_inline)) void +binary_op_impl(void (*intrin)(__ubuf__ T *, __ubuf__ T *, __ubuf__ T *, + uint64_t, uint16_t, uint16_t, uint16_t, uint16_t, + uint16_t, uint16_t), + memref_t<__ubuf__ T, 1> *src0, memref_t<__ubuf__ T, 1> *src1, + memref_t<__ubuf__ T, 1> *dst) { + auto s0 = src0->aligned + src0->offset; + auto s1 = src1->aligned + src1->offset; + auto d = dst->aligned + dst->offset; + int64_t n = dst->sizes[0]; + + INTRINSIC_NO_ARGS(set_mask_count); + INTRINSIC(set_vector_mask, 0, n); + + // 双目: (dst, src0, src1, repeat, dst_bs, src0_bs, src1_bs, dst_rs, src0_rs, + // src1_rs) + intrin(d, s0, s1, 1, 1, 1, 1, 8, 8, 8); + + INTRINSIC_NO_ARGS(set_mask_norm); +} + +// ============================================================================ +// MLIR 可调用的 C 接口 +// +// Python 侧 symbol 名: +// "custom_vexp_fp32" → C _mlir_ciface_custom_vexp_fp32 +// "custom_vdiv_fp32" → C _mlir_ciface_custom_vdiv_fp32 +// "custom_vsub_fp32" → C _mlir_ciface_custom_vsub_fp32 +// "custom_vmul_fp32" → C _mlir_ciface_custom_vmul_fp32 +// ============================================================================ + +extern "C" { + +// --- vexp: y = exp(x) --- +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_vexp_fp32(memref_t<__ubuf__ float, 1> *src, + memref_t<__ubuf__ float, 1> *dst) { + // vexp(dst, src, repeat, dst_bs, src_bs, dst_rs, src_rs) + auto s = src->aligned + src->offset; + auto d = dst->aligned + dst->offset; + int64_t n = dst->sizes[0]; + INTRINSIC_NO_ARGS(set_mask_count); + INTRINSIC(set_vector_mask, 0, n); + INTRINSIC(vexp, d, s, 1, 1, 1, 8, 8); + INTRINSIC_NO_ARGS(set_mask_norm); +} + +// --- vdiv: z = x / y --- +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_vdiv_fp32(memref_t<__ubuf__ float, 1> *src0, + memref_t<__ubuf__ float, 1> *src1, + memref_t<__ubuf__ float, 1> *dst) { + auto s0 = src0->aligned + src0->offset; + auto s1 = src1->aligned + src1->offset; + auto d = dst->aligned + dst->offset; + int64_t n = dst->sizes[0]; + INTRINSIC_NO_ARGS(set_mask_count); + INTRINSIC(set_vector_mask, 0, n); + INTRINSIC(vdiv, d, s0, s1, 1, 1, 1, 1, 8, 8, 8); + INTRINSIC_NO_ARGS(set_mask_norm); +} + +// --- vsub: z = x - y --- +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_vsub_fp32(memref_t<__ubuf__ float, 1> *src0, + memref_t<__ubuf__ float, 1> *src1, + memref_t<__ubuf__ float, 1> *dst) { + auto s0 = src0->aligned + src0->offset; + auto s1 = src1->aligned + src1->offset; + auto d = dst->aligned + dst->offset; + int64_t n = dst->sizes[0]; + INTRINSIC_NO_ARGS(set_mask_count); + INTRINSIC(set_vector_mask, 0, n); + INTRINSIC(vsub, d, s0, s1, 1, 1, 1, 1, 8, 8, 8); + INTRINSIC_NO_ARGS(set_mask_norm); +} + +// --- vmul: z = x * y --- +__aiv__ __attribute__((always_inline)) void +_mlir_ciface_custom_vmul_fp32(memref_t<__ubuf__ float, 1> *src0, + memref_t<__ubuf__ float, 1> *src1, + memref_t<__ubuf__ float, 1> *dst) { + auto s0 = src0->aligned + src0->offset; + auto s1 = src1->aligned + src1->offset; + auto d = dst->aligned + dst->offset; + int64_t n = dst->sizes[0]; + INTRINSIC_NO_ARGS(set_mask_count); + INTRINSIC(set_vector_mask, 0, n); + INTRINSIC(vmul, d, s0, s1, 1, 1, 1, 1, 8, 8, 8); + INTRINSIC_NO_ARGS(set_mask_norm); +} + +} // extern "C" diff --git a/language/deeplink/bitcode/bc/.gitignore b/language/deeplink/bitcode/bc/.gitignore new file mode 100644 index 00000000..512a2da7 --- /dev/null +++ b/language/deeplink/bitcode/bc/.gitignore @@ -0,0 +1,2 @@ +# Compiled bitcode binaries — generated by compile_bc.sh +*.aiv.bc diff --git a/language/deeplink/bitcode/bc/.gitkeep b/language/deeplink/bitcode/bc/.gitkeep new file mode 100644 index 00000000..a78b62e1 --- /dev/null +++ b/language/deeplink/bitcode/bc/.gitkeep @@ -0,0 +1,3 @@ +# This directory holds compiled bitcode (.aiv.bc) files for dl.custom(). +# Generated by dlcompiler/bitcode/compile_bc.sh from ../src/. +# Installed with the package — no extra setup.py changes needed. diff --git a/language/deeplink/custom_op.py b/language/deeplink/custom_op.py index 09073a1e..53ea97b8 100644 --- a/language/deeplink/custom_op.py +++ b/language/deeplink/custom_op.py @@ -167,9 +167,86 @@ def _add_bitcode_attr(op, builder, attrs): return from pathlib import Path - bitcode = Path(getattr(op, "bitcode")) - assert bitcode.exists(), f"Provided bitcode ({bitcode}) not exist" - attrs["bitcode"] = str(bitcode.absolute()) + bitcode_path = _resolve_bitcode_path(getattr(op, "bitcode")) + attrs["bitcode"] = bitcode_path + + +def _get_bitcode_search_paths(): + """Return list of directories to search for bitcode files, ordered by priority.""" + import os + from pathlib import Path + + paths = [] + + # 1. Environment variable override (colon-separated) + env_path = os.environ.get("DLCOMPILER_BITCODE_PATH") + if env_path: + for p in env_path.split(":"): + p = p.strip() + if p and Path(p).is_dir(): + paths.append(p) + + # 2. Relative to this file: language/deeplink/bitcode/bc/ + # Works both in source tree and after pip install. + local_bc = Path(__file__).parent / "bitcode" / "bc" + if local_bc.is_dir(): + paths.append(str(local_bc)) + + # 3. DLCOMPILER_SOURCE: project source root (set by set_env.sh) + # Used when this module runs from site-packages but bitcode is in source tree. + source_root = os.environ.get("DLCOMPILER_SOURCE") + if source_root: + builtin_bc = Path(source_root) / "language" / "deeplink" / "bitcode" / "bc" + if builtin_bc.is_dir(): + paths.append(str(builtin_bc)) + # Also check legacy dlcompiler/ location + legacy_bc = Path(source_root) / "dlcompiler" / "bitcode" / "bc" + if legacy_bc.is_dir(): + paths.append(str(legacy_bc)) + + # 4. Legacy dsl/ directory for backward compatibility + if source_root: + legacy_dsl = Path(source_root) / "dsl" + if legacy_dsl.is_dir(): + paths.append(str(legacy_dsl)) + + return paths + + +def _resolve_bitcode_path(bitcode_ref): + """Resolve a user-provided bitcode reference to an absolute file path. + + Resolution logic: + 1. If bitcode_ref is an existing file -> return its absolute path + 2. If bitcode_ref is a name -> search for {name}.aiv.bc in search paths + """ + from pathlib import Path + + bitcode = Path(bitcode_ref) + + # Case 1: Already a valid existing file path + if bitcode.is_file(): + return str(bitcode.absolute()) + + # Case 2: Name resolution + if bitcode.suffix == ".bc": + search_name = bitcode.name + else: + search_name = f"{bitcode.name}.aiv.bc" + + for search_dir in _get_bitcode_search_paths(): + candidate = Path(search_dir) / search_name + if candidate.is_file(): + return str(candidate.absolute()) + + searched_dirs = "\n ".join(_get_bitcode_search_paths()) + raise FileNotFoundError( + f"Cannot find bitcode file for '{bitcode_ref}'.\n" + f" Searched for '{search_name}' in:\n" + f" {searched_dirs}\n" + f" Set DLCOMPILER_BITCODE_PATH to add custom search paths, " + f"or provide an absolute path to an existing .aiv.bc file." + ) def _make_attrs(op, builder): diff --git a/test/ascend/test_custom_op.py b/test/ascend/test_custom_op.py index 3c3ee22b..8f411ee3 100644 --- a/test/ascend/test_custom_op.py +++ b/test/ascend/test_custom_op.py @@ -39,9 +39,9 @@ class add: def __init__(self, a, b, out=None): assert out, "out is required" self.symbol = "custom_add_" + str(a.dtype) - # self.bitcode defaults to the Ascend installation directory - # Typically it would be a specific bitcode file like /path/to/kernel.aiv.bc - self.bitcode = "/usr/local/Ascend/" + # bitcode name, auto-resolved via dlcompiler/bitcode/bc/ or + # DLCOMPILER_BITCODE_PATH environment variable + self.bitcode = "add" @triton.jit diff --git a/test/ascend/test_softmax_custom_op.py b/test/ascend/test_softmax_custom_op.py new file mode 100644 index 00000000..f674edf4 --- /dev/null +++ b/test/ascend/test_softmax_custom_op.py @@ -0,0 +1,227 @@ +""" +Triton softmax kernel using dl.custom() with bitcode auto-resolution. + +Computes: softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x))) + +The three vector ops (vsub, vexp, vdiv) are dl.custom() calls that link +against softmax_ops.aiv.bc. The bitcode path is auto-resolved from the +name "softmax_ops" — no absolute path needed. +""" + +import os + +# Ensure bishengir tools are in PATH before triton imports read BISHENG_INSTALL_PATH. +_BISHENG_INSTALL = ( + "/mnt/data01/zmz/workspace/04ttshared/fordlc/ascendnpu-ir-0514/build/install/bin/" +) +if os.path.isdir(_BISHENG_INSTALL): + os.environ.setdefault("BISHENG_INSTALL_PATH", _BISHENG_INSTALL) + if _BISHENG_INSTALL not in os.environ.get("PATH", ""): + os.environ["PATH"] = _BISHENG_INSTALL + os.pathsep + os.environ.get("PATH", "") + +import torch +import triton +import triton.language as tl +import triton.language.extra.deeplink as dl + +# ====================================================================== +# DSL custom op registration — bitcode auto-resolved by name +# ====================================================================== + + +@dl.register_custom_op +class vsub_fp32: + core = dl.CORE.VECTOR + pipe = dl.PIPE.PIPE_V + mode = dl.MODE.SIMD + + def __init__(self, a, b, out=None): + assert out is not None, "dl.custom() requires out= parameter" + self.symbol = "custom_vsub_fp32" + self.bitcode = "softmax_ops" # auto-resolved to softmax_ops.aiv.bc + + +@dl.register_custom_op +class vexp_fp32: + core = dl.CORE.VECTOR + pipe = dl.PIPE.PIPE_V + mode = dl.MODE.SIMD + + def __init__(self, a, out=None): + assert out is not None, "dl.custom() requires out= parameter" + self.symbol = "custom_vexp_fp32" + self.bitcode = "softmax_ops" + + +@dl.register_custom_op +class vdiv_fp32: + core = dl.CORE.VECTOR + pipe = dl.PIPE.PIPE_V + mode = dl.MODE.SIMD + + def __init__(self, a, b, out=None): + assert out is not None, "dl.custom() requires out= parameter" + self.symbol = "custom_vdiv_fp32" + self.bitcode = "softmax_ops" + + +# ====================================================================== +# Triton kernel +# ====================================================================== + +CHUNK_SIZE = 1024 # max vector length for fp32 DSL ops +MIN_CHUNK_SIZE = 8 # min vector length (SIMD width >= 2) + + +@triton.jit +def softmax_kernel( + x_ptr, + output_ptr, + N, + row_stride, + BLOCK_SIZE: tl.constexpr, + CHUNK_SIZE: tl.constexpr, +): + """Row-wise fused softmax using dl.custom() on Ascend NPU. + + One program = one row. Processing in chunks of CHUNK_SIZE. + Reductions (tl.max, tl.sum) use native Triton. + """ + pid = tl.program_id(axis=0) + row_start = pid * row_stride + + # --- Pass 1: row-wise maximum --- + row_max = float("-inf") + for start in tl.static_range(0, BLOCK_SIZE, CHUNK_SIZE): + offsets = row_start + start + tl.arange(0, CHUNK_SIZE) + mask = (start + tl.arange(0, CHUNK_SIZE)) < N + x_chunk = tl.load(x_ptr + offsets, mask=mask, other=float("-inf")) + row_max = tl.maximum(row_max, tl.max(x_chunk)) + + # --- Pass 2: denominator sum(exp(x - max)) --- + denom = 0.0 + for start in tl.static_range(0, BLOCK_SIZE, CHUNK_SIZE): + offsets = row_start + start + tl.arange(0, CHUNK_SIZE) + mask = (start + tl.arange(0, CHUNK_SIZE)) < N + x_chunk = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_f32 = x_chunk.to(tl.float32) + + zero = tl.full([CHUNK_SIZE], 0, tl.float32) + row_max_vec = zero + row_max + + # vsub: shifted = x - max + buf_sub = tl.full([CHUNK_SIZE], 0, tl.float32) + shifted = dl.custom("vsub_fp32", x_f32, row_max_vec, out=buf_sub) + + # vexp: exp_vals = exp(shifted) + buf_exp = tl.full([CHUNK_SIZE], 0, tl.float32) + exp_vals = dl.custom("vexp_fp32", shifted, out=buf_exp) + + denom += tl.sum(tl.where(mask, exp_vals, 0.0)) + + # --- Pass 3: normalize and store --- + for start in tl.static_range(0, BLOCK_SIZE, CHUNK_SIZE): + offsets = row_start + start + tl.arange(0, CHUNK_SIZE) + mask = (start + tl.arange(0, CHUNK_SIZE)) < N + x_chunk = tl.load(x_ptr + offsets, mask=mask, other=0.0) + x_f32 = x_chunk.to(tl.float32) + + zero = tl.full([CHUNK_SIZE], 0, tl.float32) + row_max_vec = zero + row_max + denom_vec = zero + denom + + # vsub: shifted = x - max + buf_sub = tl.full([CHUNK_SIZE], 0, tl.float32) + shifted = dl.custom("vsub_fp32", x_f32, row_max_vec, out=buf_sub) + + # vexp: exp_vals = exp(shifted) + buf_exp = tl.full([CHUNK_SIZE], 0, tl.float32) + exp_vals = dl.custom("vexp_fp32", shifted, out=buf_exp) + + # vdiv: result = exp_vals / denom + buf_div = tl.full([CHUNK_SIZE], 0, tl.float32) + result = dl.custom("vdiv_fp32", exp_vals, denom_vec, out=buf_div) + + tl.store(output_ptr + offsets, result.to(x_chunk.dtype), mask=mask) + + +# ====================================================================== +# Public wrapper +# ====================================================================== + + +def softmax(x: torch.Tensor, dim: int = -1) -> torch.Tensor: + """Softmax using dl.custom() Triton kernel. + + Args: + x: Input tensor (fp16, fp32, or bf16). + dim: Dimension for softmax (default: -1). + + Returns: + Output tensor with same shape/dtype as x. + """ + ndim = x.ndim + if dim < 0: + dim = ndim + dim + + # Handle non-last-dim by transposition + permuted = False + inv_perm = list(range(ndim)) + if dim != ndim - 1: + perm = list(range(ndim)) + perm[-1], perm[dim] = perm[dim], perm[-1] + x = x.permute(perm).contiguous() + dim = ndim - 1 + permuted = True + inv_perm = [0] * ndim + for i, p in enumerate(perm): + inv_perm[p] = i + else: + x = x.contiguous() + + N = x.shape[-1] + M = x.numel() // N + + BLOCK_SIZE = triton.next_power_of_2(N) + CS = max(MIN_CHUNK_SIZE, min(CHUNK_SIZE, BLOCK_SIZE)) + + output = torch.empty(x.shape, dtype=x.dtype, device=x.device) + grid = (M,) + softmax_kernel[grid]( + x, + output, + N, + N, + BLOCK_SIZE=BLOCK_SIZE, + CHUNK_SIZE=CS, + ) + + if permuted: + output = output.permute(inv_perm) + + return output + + +# ====================================================================== +# Test +# ====================================================================== + +if __name__ == "__main__": + print("=== Softmax Triton Kernel Test (dl.custom) ===") + + for shape, dim in [ + ((4, 128), -1), + ((4, 1024), -1), + ((2, 4096), -1), + ((16, 256), 0), + ]: + x = torch.randn(shape, dtype=torch.float32, device="npu") + y = softmax(x, dim=dim) + ref = torch.nn.functional.softmax(x.float(), dim=dim) + + max_diff = (y - ref).abs().max().item() + passed = torch.allclose(y, ref, atol=1e-5) + status = "PASS" if passed else "FAIL" + print(f" [{status}] shape={shape}, dim={dim}, max_diff={max_diff:.2e}") + + print("Done.")