Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/npu.py
Original file line number Diff line number Diff line change
Expand Up @@ -1430,7 +1430,7 @@ def _format_of(ty):
name.append(kernelName);
{'auto launch_call = [=]()' if enable_taskqueue else ''} {{
uint32_t blockNum = gridX * gridY * gridZ;
{'if (blockNum > (uint32_t)' + str(num_physical_blocks) + ') { std::cout << "WARNING: Grid " << blockNum << " > physical limit ' + str(num_physical_blocks) + ', performance maybe reduced." << std::endl;if (blockNum > 65535 && !' + str(enable_auto_map_parallel_blocks).lower() + ') {std::cout << "Grid " << blockNum << " > 65535, Please set TRITON_ALL_BLOCKS_PARALLEL=1 to enable all blocks parallel execution." << std::endl; } }'}
{'if (blockNum > (uint32_t)' + str(num_physical_blocks) + ') { /* std::cout << "WARNING: Grid " << blockNum << " > physical limit ' + str(num_physical_blocks) + ', performance maybe reduced." << std::endl; */ if (blockNum > 65535 && !' + str(enable_auto_map_parallel_blocks).lower() + ') {std::cout << "Grid " << blockNum << " > 65535, Please set TRITON_ALL_BLOCKS_PARALLEL=1 to enable all blocks parallel execution." << std::endl; } }'}

{'blockNum = std::min(blockNum, (uint32_t)' + str(num_physical_blocks) + ');' if enable_auto_map_parallel_blocks else ''}
{'cce::internal::DebugTunnelData *DTData = cce::internal::DebugTunnel::Open(blockNum);' if enable_device_print else ''}
Expand Down
14 changes: 14 additions & 0 deletions compile_shared.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,17 @@ else
fi
fi
notify_apply_patch

# ============================================================================
# Build the bitcode libraries that back dl.custom() custom ops
# ============================================================================
# Best-effort step: it is skipped when ccec is not on PATH, and a failing
# compile_bc.sh only produces a warning.
if ! command -v ccec > /dev/null 2>&1; then
    echo "ccec not found, skipping bitcode compilation."
    echo " Install CANN toolkit or ensure ccec is in PATH to enable."
else
    echo "Compiling bitcode libraries for custom ops..."
    BITCODE_DIR="$home_path/dlcompiler/bitcode"
    # Nothing is run (and nothing more is printed) when the script is absent.
    if [ -f "$BITCODE_DIR/compile_bc.sh" ] && ! bash "$BITCODE_DIR/compile_bc.sh"; then
        echo "Warning: bitcode compilation failed (non-fatal)"
    fi
fi
171 changes: 171 additions & 0 deletions dlcompiler/bitcode/compile_bc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# ============================================================================
# Compile all src/*.cpp → bc/*.aiv.bc (bitcode library for dl.custom())
#
# 编译器: ccec (Ascend CANN CCE compiler)
# 架构: dav-c220-vec (Ascend 910B2 vector core)
#
# 用法:
# bash compile_bc.sh 编译所有
# bash compile_bc.sh add 只编译 add.cpp
# bash compile_bc.sh softmax 只编译 softmax_ops.cpp
# bash compile_bc.sh -f 强制重新编译全部
# ============================================================================

# Strict mode: stop on command errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -e -u -o pipefail

# Absolute path of the directory holding this script.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
SRC_DIR="${SCRIPT_DIR}/src"
# Bitcode goes to language/deeplink/bitcode/bc/ so the .bc files are
# auto-installed with pip.
BC_DIR="${SCRIPT_DIR}/../../language/deeplink/bitcode/bc"

# ============================================================================
# 1. Locate the CANN installation
# ============================================================================
# Prints the CANN root directory on stdout, or an empty line when no probe
# finds one. Probes, in priority order:
#   1. $CANN_PATH, when set and an existing directory
#   2. $ASCEND_HOME_PATH/cann, then $ASCEND_HOME_PATH/cann-9.0.0
#   3. the first /usr/local/Ascend/cann-*/ entry as listed by ls
detect_cann_path() {
    local candidate

    if [ -n "${CANN_PATH:-}" ] && [ -d "$CANN_PATH" ]; then
        echo "$CANN_PATH"
        return
    fi

    if [ -n "${ASCEND_HOME_PATH:-}" ]; then
        for candidate in "$ASCEND_HOME_PATH/cann" "$ASCEND_HOME_PATH/cann-9.0.0"; do
            if [ -d "$candidate" ]; then
                echo "$candidate"
                return
            fi
        done
    fi

    # `|| true` keeps a failed listing from aborting the script under
    # `set -e -o pipefail`.
    candidate=$(ls -d /usr/local/Ascend/cann-*/ 2>/dev/null | head -1 || true)
    # Strip the trailing slash that the pattern leaves on the match; an empty
    # result stays empty.
    echo "${candidate%/}"
}

# Resolve the CANN root once; abort with guidance when it cannot be found.
CANN_HOME=$(detect_cann_path)
if [ -n "$CANN_HOME" ]; then
    echo "CANN_HOME: $CANN_HOME"
else
    echo "ERROR: Cannot find CANN installation."
    echo " Set CANN_PATH or ASCEND_HOME_PATH environment variable,"
    echo " or install CANN toolkit under /usr/local/Ascend/"
    exit 1
fi

# ============================================================================
# 2. Locate the ccec compiler
# ============================================================================
# ccec is looked up at bin/ccec under the CANN root and must be executable.
CCEC="${CANN_HOME}/bin/ccec"
if [ -x "$CCEC" ]; then
    echo "CCEC: $CCEC"
else
    echo "ERROR: ccec not found at $CCEC"
    exit 1
fi

# ============================================================================
# 3. Compiler flags
# ============================================================================
# Target architecture: dav-c220-vec (Ascend 910B2), dav-c100-vec (Ascend 910B1).
# Override it through the DLCOMPILER_AICORE_ARCH environment variable.
AICORE_ARCH="${DLCOMPILER_AICORE_ARCH:-dav-c220-vec}"

CCEC_FLAGS="-x cce --cce-aicore-arch=${AICORE_ARCH} --cce-aicore-only -c -emit-llvm --std=c++17"

# Header tree root. Prefer the directory named after this host's machine type
# so the script also works on non-aarch64 hosts; keep the previously
# hard-coded aarch64-linux directory when no such directory exists.
# NOTE(review): assumes CANN names the directory "<uname -m>-linux" - confirm
# against the installed toolkit layout.
CANN_ARCH_DIR="${CANN_HOME}/$(uname -m)-linux"
if [ ! -d "$CANN_ARCH_DIR" ]; then
    CANN_ARCH_DIR="${CANN_HOME}/aarch64-linux"
fi

CCEC_INCLUDES="\
-I ${CANN_HOME}/asc \
-I ${CANN_ARCH_DIR}/asc/include/basic_api \
-I ${CANN_ARCH_DIR}/asc/include/interface \
-I ${CANN_ARCH_DIR}/ascendc/include/highlevel_api \
-I ${CANN_ARCH_DIR}/ascendc/include/basic_api/impl \
-I ${CANN_ARCH_DIR}/ascendc/basic_api \
-I ${CANN_ARCH_DIR}/ascendc/basic_api/interface \
-I ${CANN_ARCH_DIR}/ascendc/highlevel_api/lib \
-I ${CANN_ARCH_DIR}/tiling"

echo "AICORE_ARCH: $AICORE_ARCH"
echo "INCLUDES:"
# Print one token per line for readability.
echo "$CCEC_INCLUDES" | tr ' ' '\n' | sed 's/^/ /'

# ============================================================================
# 4. Make sure the bc/ output directory exists
# ============================================================================
mkdir -p "$BC_DIR"

# ============================================================================
# 5. Compile helper
# ============================================================================
# compile_one <path/to/name.cpp>
#   Compiles one source file to $BC_DIR/<name>.aiv.bc with ccec.
#   An existing output file is left untouched unless FORCE is "true".
#   Reads the globals CCEC, CCEC_FLAGS, CCEC_INCLUDES, BC_DIR and FORCE.
#   Prints a FAILED line and exits the script with status 1 when the
#   compiler fails.
compile_one() {
    local CPP="$1"
    # Declared separately so `local` does not mask a failure of basename.
    local base_name
    base_name="$(basename "${CPP%.cpp}")"
    local BC="$BC_DIR/${base_name}.aiv.bc"

    if [ -f "$BC" ] && [ "$FORCE" != "true" ]; then
        echo "Bitcode file $BC already exists, skipping."
        echo " To recompile: rm -f $BC && bash compile_bc.sh"
        return
    fi

    echo "Compiling $CPP → $BC ..."
    # The compiler runs as the `if` condition: under `set -e` a bare failing
    # command would end the script before the FAILED message could be printed.
    # CCEC_FLAGS and CCEC_INCLUDES are left unquoted on purpose so that they
    # split into separate words.
    if ! "${CCEC}" ${CCEC_FLAGS} ${CCEC_INCLUDES} "${CPP}" -o "${BC}"; then
        echo " FAILED: $CPP"
        exit 1
    fi

    echo " OK: $BC"
    # Show exported custom symbols; try demangled output first, then raw names.
    if command -v nm &> /dev/null; then
        echo " Symbols:"
        nm -C "${BC}" 2>/dev/null | grep -i custom | sed 's/^/ /' || \
            nm "${BC}" 2>/dev/null | grep -i custom | sed 's/^/ /' || true
    fi
}

# ============================================================================
# 6. Parse arguments and determine the compile target
# ============================================================================
# -f / --force switches FORCE on; any other argument is taken as the target
# name. When several targets are given, the last one wins.
FORCE=false
TARGET=""

for arg; do
    if [ "$arg" = "-f" ] || [ "$arg" = "--force" ]; then
        FORCE=true
    else
        TARGET="$arg"
    fi
done

# Work from the script directory, then compile the selected target.
cd "$SCRIPT_DIR"

# Target resolution:
#   add / softmax  -> fixed source files
#   all, or empty  -> every src/*.cpp
#   anything else  -> src/<name>.cpp when that file exists, otherwise an error
case "${TARGET:-all}" in
    add)
        compile_one "$SRC_DIR/add.cpp"
        ;;
    softmax)
        compile_one "$SRC_DIR/softmax_ops.cpp"
        ;;
    all)
        for CPP in "$SRC_DIR"/*.cpp; do
            # Skips the literal pattern that remains when nothing matches.
            if [ -f "$CPP" ]; then
                compile_one "$CPP"
            fi
        done
        ;;
    *)
        if [ -f "$SRC_DIR/$TARGET.cpp" ]; then
            compile_one "$SRC_DIR/$TARGET.cpp"
        else
            echo "Unknown target: $TARGET"
            echo " Options: add, softmax, all, or the base name of any src/*.cpp file"
            exit 1
        fi
        ;;
esac

# Final summary: list the produced bitcode files, or note that there are none.
printf '\n'
echo "Done. Bitcode files in: $BC_DIR"
if ! ls -lh "$BC_DIR"/*.aiv.bc 2>/dev/null; then
    echo " (no .aiv.bc files)"
fi
125 changes: 125 additions & 0 deletions dlcompiler/bitcode/src/add.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// ============================================================================
// DSL Custom Op — add (multi-dtype: int32 / fp32 / fp16)
//
// 作用: 在一个 bitcode 中提供三种 dtype 的 vadd C 接口,供 MLIR 调用。
// 编译: bash compile_bc.sh add → add.aiv.bc
// 符号:
// _mlir_ciface_custom_add_int32 (对应 dl custom_add_int32)
// _mlir_ciface_custom_add_fp32 (对应 dl custom_add_fp32)
// _mlir_ciface_custom_add_fp16 (对应 dl custom_add_fp16)
//
// 共享位码库: ops/skills/triton/triton-dsl-custom-op/bitcode_lib/
// ============================================================================

// Qualifier placed on every device function in this file; expands to the
// [aicore] attribute.
#define __aiv__ [aicore]
// Call wrappers for intrinsics: each expands to a plain call of NAME, without
// arguments and with the given arguments respectively.
#define INTRINSIC_NO_ARGS(NAME) NAME()
#define INTRINSIC(NAME, ...) NAME(__VA_ARGS__)

/// MLIR memref descriptor: describes one contiguous piece of memory as
/// pointers plus offset, shape and strides.
///
/// @tparam T   element type
/// @tparam Dim number of dimensions (length of `sizes` and `strides`)
template <class T, size_t Dim>
struct memref_t {
  T *allocated;         ///< base address of the allocation
  T *aligned;           ///< aligned address where the valid data starts
  int64_t offset;       ///< offset, counted in elements
  int64_t sizes[Dim];   ///< length of each dimension
  int64_t strides[Dim]; ///< stride of each dimension
};

/// Argument bundle for the vadd instruction.
///
/// @tparam OPERANUM number of source operands
/// @tparam SRC_T    source element type
/// @tparam DST_T    destination element type (defaults to SRC_T)
template <size_t OPERANUM, class SRC_T, class DST_T = SRC_T>
struct intrin_args {
  __ubuf__ DST_T *dst;                  ///< output pointer
  __ubuf__ SRC_T *src[OPERANUM];        ///< input pointers, one per operand
  SRC_T scalar;                         ///< scalar value (unused)
  uint64_t repeat;                      ///< repeat count
  uint16_t dst_block_stride;            ///< output block stride
  uint16_t src_block_stride[OPERANUM];  ///< input block strides
  uint16_t dst_repeat_stride;           ///< output stride between repeats
  uint16_t src_repeat_stride[OPERANUM]; ///< input strides between repeats
};

/// Element-wise vector add: forwards the bundled arguments to the `vadd`
/// intrinsic.
///
/// Operand order handed to the intrinsic:
///   vadd(dst, src0, src1, repeat, dst_block_stride, src0_block_stride,
///        src1_block_stride, dst_repeat_stride, src0_repeat_stride,
///        src1_repeat_stride)
///
/// @param args destination, the two sources, repeat count and strides;
///             `args.scalar` is not read here.
template <typename SRC_TYPE, typename DST_TYPE = SRC_TYPE>
__aiv__ __attribute__((always_inline)) void
vector_eltwise_vadd_intrin(intrin_args<2, SRC_TYPE, DST_TYPE> args) {
  // The argument list is written out directly instead of going through a
  // function-local #define, which would stay defined for the rest of the
  // translation unit.
  INTRINSIC(vadd, args.dst, args.src[0], args.src[1], args.repeat,
            args.dst_block_stride, args.src_block_stride[0],
            args.src_block_stride[1], args.dst_repeat_stride,
            args.src_repeat_stride[0], args.src_repeat_stride[1]);
}

/// vadd call wrapper: the shared logic for contiguous access.
///
/// The element count comes from `dst->sizes[0]`. It is programmed as the
/// vector mask between set_mask_count and set_mask_norm, and vadd is issued
/// once (repeat = 1).
///
/// @param src0 first input memref
/// @param src1 second input memref
/// @param dst  output memref; its sizes[0] is the element count used
template <typename T>
__aiv__ __attribute__((always_inline)) void
vadd_impl(memref_t<__ubuf__ T, 1> *src0, memref_t<__ubuf__ T, 1> *src1,
          memref_t<__ubuf__ T, 1> *dst) {
  // Resolve each descriptor to the address of its first element.
  auto lhs = src0->aligned + src0->offset;
  auto rhs = src1->aligned + src1->offset;
  auto out = dst->aligned + dst->offset;

  const uint16_t kBlockStride = 1;
  const uint16_t kRepeatStride = 8;
  const int64_t count = dst->sizes[0];

  // Set the vector mask (takes care of the boundary).
  INTRINSIC_NO_ARGS(set_mask_count);
  INTRINSIC(set_vector_mask, 0, count);

  // Issue the vadd instruction.
  intrin_args<2, T> args{out,
                         {lhs, rhs},
                         0, // scalar (unused)
                         1, // repeat = 1: one issue, the mask covers all elements
                         kBlockStride,
                         {kBlockStride, kBlockStride},
                         kRepeatStride,
                         {kRepeatStride, kRepeatStride}};
  vector_eltwise_vadd_intrin<T>(args);

  // Restore the mask.
  INTRINSIC_NO_ARGS(set_mask_norm);
}

// ============================================================================
// C entry points callable from MLIR (one per dtype)
//
// Symbol naming rule on the Python side:
//   str(tl.int32)   -> "int32" -> symbol = "custom_add_int32"
//   str(tl.float32) -> "fp32"  -> symbol = "custom_add_fp32"
//   str(tl.float16) -> "fp16"  -> symbol = "custom_add_fp16"
//
// MLIR automatically adds the _mlir_ciface_ prefix:
//   Python "custom_add_int32" -> C _mlir_ciface_custom_add_int32
//   Python "custom_add_fp32"  -> C _mlir_ciface_custom_add_fp32
//   Python "custom_add_fp16"  -> C _mlir_ciface_custom_add_fp16
// ============================================================================

extern "C" {

/// int32 entry point: dst = src0 + src1, element-wise.
__aiv__ __attribute__((always_inline)) void
_mlir_ciface_custom_add_int32(memref_t<__ubuf__ int32_t, 1> *src0,
                              memref_t<__ubuf__ int32_t, 1> *src1,
                              memref_t<__ubuf__ int32_t, 1> *dst) {
  vadd_impl<int32_t>(src0, src1, dst);
}

/// fp32 entry point: dst = src0 + src1, element-wise.
__aiv__ __attribute__((always_inline)) void
_mlir_ciface_custom_add_fp32(memref_t<__ubuf__ float, 1> *src0,
                             memref_t<__ubuf__ float, 1> *src1,
                             memref_t<__ubuf__ float, 1> *dst) {
  vadd_impl<float>(src0, src1, dst);
}

/// fp16 entry point: dst = src0 + src1, element-wise.
__aiv__ __attribute__((always_inline)) void
_mlir_ciface_custom_add_fp16(memref_t<__ubuf__ __fp16, 1> *src0,
                             memref_t<__ubuf__ __fp16, 1> *src1,
                             memref_t<__ubuf__ __fp16, 1> *dst) {
  vadd_impl<__fp16>(src0, src1, dst);
}

} // extern "C"
Loading
Loading