diff --git a/backend/npu.py b/backend/npu.py
index 610cefae..0c89c723 100644
--- a/backend/npu.py
+++ b/backend/npu.py
@@ -1430,7 +1430,7 @@ def _format_of(ty):
   name.append(kernelName);
    {'auto launch_call = [=]()' if enable_taskqueue else ''} {{
     uint32_t blockNum = gridX * gridY * gridZ;
-    {'if (blockNum > (uint32_t)' + str(num_physical_blocks) + ') { std::cout << "WARNING: Grid " << blockNum << " > physical limit ' + str(num_physical_blocks) + ', performance maybe reduced." << std::endl;if (blockNum > 65535 && !' + str(enable_auto_map_parallel_blocks).lower() + ') {std::cout << "Grid " << blockNum << " > 65535, Please set TRITON_ALL_BLOCKS_PARALLEL=1 to enable all blocks parallel execution." << std::endl; } }'}
+    {'if (blockNum > (uint32_t)' + str(num_physical_blocks) + ') { /* std::cout << "WARNING: Grid " << blockNum << " > physical limit ' + str(num_physical_blocks) + ', performance maybe reduced." << std::endl; */ if (blockNum > 65535 && !' + str(enable_auto_map_parallel_blocks).lower() + ') {std::cout << "Grid " << blockNum << " > 65535, Please set TRITON_ALL_BLOCKS_PARALLEL=1 to enable all blocks parallel execution." << std::endl; } }'}
 
     {'blockNum = std::min(blockNum, (uint32_t)' + str(num_physical_blocks) + ');' if enable_auto_map_parallel_blocks else ''}
     {'cce::internal::DebugTunnelData *DTData = cce::internal::DebugTunnel::Open(blockNum);' if enable_device_print else ''}
diff --git a/compile_shared.sh b/compile_shared.sh
index d5fdd058..945b9d9e 100644
--- a/compile_shared.sh
+++ b/compile_shared.sh
@@ -117,3 +117,17 @@ else
     fi
 fi
 notify_apply_patch
+
+# ============================================================================
+# Compile bitcode libraries for dl.custom() custom ops
+# ============================================================================
+if command -v ccec &> /dev/null; then
+    echo "Compiling bitcode libraries for custom ops..."
+    BITCODE_DIR="$home_path/dlcompiler/bitcode"
+    if [ -f "$BITCODE_DIR/compile_bc.sh" ]; then
+        bash "$BITCODE_DIR/compile_bc.sh" || echo "Warning: bitcode compilation failed (non-fatal)"
+    fi
+else
+    echo "ccec not found, skipping bitcode compilation."
+    echo "  Install CANN toolkit or ensure ccec is in PATH to enable."
+fi
diff --git a/dlcompiler/bitcode/compile_bc.sh b/dlcompiler/bitcode/compile_bc.sh
new file mode 100755
index 00000000..aa01cc7a
--- /dev/null
+++ b/dlcompiler/bitcode/compile_bc.sh
@@ -0,0 +1,171 @@
+# ============================================================================
+# Compile all src/*.cpp → bc/*.aiv.bc (bitcode library for dl.custom())
+#
+# 编译器: ccec (Ascend CANN CCE compiler)
+# 架构:   dav-c220-vec (Ascend 910B2 vector core)
+#
+# 用法:
+#   bash compile_bc.sh             编译所有
+#   bash compile_bc.sh add         只编译 add.cpp
+#   bash compile_bc.sh softmax     只编译 softmax_ops.cpp
+#   bash compile_bc.sh -f          强制重新编译全部
+# ============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SRC_DIR="$SCRIPT_DIR/src"
+# Output to language/deeplink/bitcode/bc/ so .bc files are auto-installed with pip
+BC_DIR="$SCRIPT_DIR/../../language/deeplink/bitcode/bc"
+
+# ============================================================================
+# 1. 检测 CANN 安装路径
+# ============================================================================
+detect_cann_path() {
+    local cann_path=""
+
+    # 优先级 1: CANN_PATH 环境变量
+    if [ -n "${CANN_PATH:-}" ] && [ -d "$CANN_PATH" ]; then
+        cann_path="$CANN_PATH"
+    # 优先级 2: ASCEND_HOME_PATH 下的 cann 子目录
+    elif [ -n "${ASCEND_HOME_PATH:-}" ] && [ -d "$ASCEND_HOME_PATH/cann" ]; then
+        cann_path="$ASCEND_HOME_PATH/cann"
+    elif [ -n "${ASCEND_HOME_PATH:-}" ] && [ -d "$ASCEND_HOME_PATH/cann-9.0.0" ]; then
+        cann_path="$ASCEND_HOME_PATH/cann-9.0.0"
+    # 优先级 3: 自动检测 /usr/local/Ascend/cann-*/
+    else
+        cann_path=$(ls -d /usr/local/Ascend/cann-*/ 2>/dev/null | head -1 || true)
+        if [ -n "$cann_path" ]; then
+            cann_path="${cann_path%/}"
+        fi
+    fi
+
+    echo "$cann_path"
+}
+
+CANN_HOME=$(detect_cann_path)
+if [ -z "$CANN_HOME" ]; then
+    echo "ERROR: Cannot find CANN installation."
+    echo "  Set CANN_PATH or ASCEND_HOME_PATH environment variable,"
+    echo "  or install CANN toolkit under /usr/local/Ascend/"
+    exit 1
+fi
+echo "CANN_HOME: $CANN_HOME"
+
+# ============================================================================
+# 2. 检测 ccec 编译器
+# ============================================================================
+CCEC="${CANN_HOME}/bin/ccec"
+if [ ! -x "$CCEC" ]; then
+    echo "ERROR: ccec not found at $CCEC"
+    exit 1
+fi
+echo "CCEC: $CCEC"
+
+# ============================================================================
+# 3. 编译参数
+# ============================================================================
+# 架构: dav-c220-vec (Ascend 910B2), dav-c100-vec (Ascend 910B1)
+AICORE_ARCH="${DLCOMPILER_AICORE_ARCH:-dav-c220-vec}"
+
+CCEC_FLAGS="-x cce --cce-aicore-arch=${AICORE_ARCH} --cce-aicore-only -c -emit-llvm --std=c++17"
+
+CCEC_INCLUDES="\
+    -I ${CANN_HOME}/asc \
+    -I ${CANN_HOME}/aarch64-linux/asc/include/basic_api \
+    -I ${CANN_HOME}/aarch64-linux/asc/include/interface \
+    -I ${CANN_HOME}/aarch64-linux/ascendc/include/highlevel_api \
+    -I ${CANN_HOME}/aarch64-linux/ascendc/include/basic_api/impl \
+    -I ${CANN_HOME}/aarch64-linux/ascendc/basic_api \
+    -I ${CANN_HOME}/aarch64-linux/ascendc/basic_api/interface \
+    -I ${CANN_HOME}/aarch64-linux/ascendc/highlevel_api/lib \
+    -I ${CANN_HOME}/aarch64-linux/tiling"
+
+echo "AICORE_ARCH: $AICORE_ARCH"
+echo "INCLUDES:"
+echo "$CCEC_INCLUDES" | tr ' ' '\n' | sed 's/^/  /'
+
+# ============================================================================
+# 4. 确保 bc/ 目录存在
+# ============================================================================
+mkdir -p "$BC_DIR"
+
+# ============================================================================
+# 5. 编译函数
+# ============================================================================
+compile_one() {
+    local CPP="$1"
+    local base_name="$(basename "${CPP%.cpp}")"
+    local BC="$BC_DIR/${base_name}.aiv.bc"
+
+    if [ -f "$BC" ] && [ "$FORCE" != "true" ]; then
+        echo "Bitcode file $BC already exists, skipping."
+        echo "  To recompile: rm -f $BC && bash compile_bc.sh"
+        return
+    fi
+
+    echo "Compiling $CPP → $BC ..."
+    ${CCEC} ${CCEC_FLAGS} ${CCEC_INCLUDES} "${CPP}" -o "${BC}"
+
+    if [ $? -eq 0 ]; then
+        echo "  OK: $BC"
+        # Show exported custom symbols
+        if command -v nm &> /dev/null; then
+            echo "  Symbols:"
+            nm -C "${BC}" 2>/dev/null | grep -i custom | sed 's/^/    /' || \
+            nm "${BC}" 2>/dev/null | grep -i custom | sed 's/^/    /' || true
+        fi
+    else
+        echo "  FAILED: $CPP"
+        exit 1
+    fi
+}
+
+# ============================================================================
+# 6. 解析参数 → 确定编译目标
+# ============================================================================
+FORCE=false
+TARGET=""
+
+for arg in "$@"; do
+    case "$arg" in
+        -f|--force)
+            FORCE=true
+            ;;
+        *)
+            TARGET="$arg"
+            ;;
+    esac
+done
+
+cd "$SCRIPT_DIR"
+
+if [ -n "$TARGET" ]; then
+    case "$TARGET" in
+        add)
+            compile_one "$SRC_DIR/add.cpp"
+            ;;
+        softmax)
+            compile_one "$SRC_DIR/softmax_ops.cpp"
+            ;;
+        all)
+            for CPP in "$SRC_DIR"/*.cpp; do
+                [ -f "$CPP" ] && compile_one "$CPP"
+            done
+            ;;
+        *)
+            echo "Unknown target: $TARGET"
+            echo "  Options: add, softmax, softmax_full, all"
+            exit 1
+            ;;
+    esac
+else
+    # 默认：编译所有
+    for CPP in "$SRC_DIR"/*.cpp; do
+        [ -f "$CPP" ] && compile_one "$CPP"
+    done
+fi
+
+echo ""
+echo "Done. Bitcode files in: $BC_DIR"
+ls -lh "$BC_DIR"/*.aiv.bc 2>/dev/null || echo "  (no .aiv.bc files)"
diff --git a/dlcompiler/bitcode/src/add.cpp b/dlcompiler/bitcode/src/add.cpp
new file mode 100644
index 00000000..a4159e7c
--- /dev/null
+++ b/dlcompiler/bitcode/src/add.cpp
@@ -0,0 +1,125 @@
+// ============================================================================
+// DSL Custom Op — add (multi-dtype: int32 / fp32 / fp16)
+//
+// 作用: 在一个 bitcode 中提供三种 dtype 的 vadd C 接口，供 MLIR 调用。
+// 编译: bash compile.sh → add.aiv.bc
+// 符号:
+//   _mlir_ciface_custom_add_int32  (对应 dl custom_add_int32)
+//   _mlir_ciface_custom_add_fp32   (对应 dl custom_add_fp32)
+//   _mlir_ciface_custom_add_fp16   (对应 dl custom_add_fp16)
+//
+// 共享位码库: ops/skills/triton/triton-dsl-custom-op/bitcode_lib/
+// ============================================================================
+
+#define __aiv__ [aicore]
+#define INTRINSIC_NO_ARGS(NAME) NAME()
+#define INTRINSIC(NAME, ...) NAME(__VA_ARGS__)
+
+// MLIR memref 结构：表示一块连续内存（指针 + offset + shape + strides）
+template <typename T, size_t Dim> struct memref_t {
+  T *allocated;         // 分配基址
+  T *aligned;           // 对齐后的有效起始地址
+  int64_t offset;       // 元素偏移量
+  int64_t sizes[Dim];   // 各维度长度
+  int64_t strides[Dim]; // 各维度步长
+};
+
+// vadd 指令参数结构
+template <size_t OPERANUM, typename SRC_T, typename DST_T = SRC_T>
+struct intrin_args {
+  __ubuf__ DST_T *dst;                  // 输出指针
+  __ubuf__ SRC_T *src[OPERANUM];        // 输入指针数组
+  SRC_T scalar;                         // 标量值（未使用）
+  uint64_t repeat;                      // 重复次数
+  uint16_t dst_block_stride;            // block 内步长
+  uint16_t src_block_stride[OPERANUM];  // 输入 block 内步长
+  uint16_t dst_repeat_stride;           // repeat 间步长
+  uint16_t src_repeat_stride[OPERANUM]; // 输入 repeat 间步长
+};
+
+// vadd 模板函数：逐元素向量加法
+template <typename SRC_TYPE, typename DST_TYPE = SRC_TYPE>
+__aiv__ __attribute__((always_inline)) void
+vector_eltwise_vadd_intrin(intrin_args<2, SRC_TYPE, DST_TYPE> args) {
+#define ELTWISE_VV_ARGS                                                        \
+  args.dst, args.src[0], args.src[1], args.repeat, args.dst_block_stride,      \
+      args.src_block_stride[0], args.src_block_stride[1],                      \
+      args.dst_repeat_stride, args.src_repeat_stride[0],                       \
+      args.src_repeat_stride[1]
+
+  // vadd(dst, src0, src1, repeat, dst_bs, src0_bs, src1_bs, dst_rs, src0_rs,
+  // src1_rs)
+  INTRINSIC(vadd, ELTWISE_VV_ARGS);
+}
+
+// vadd 调用包装：处理连续访问的公共逻辑
+template <typename T>
+__aiv__ __attribute__((always_inline)) void
+vadd_impl(memref_t<__ubuf__ T, 1> *src0, memref_t<__ubuf__ T, 1> *src1,
+          memref_t<__ubuf__ T, 1> *dst) {
+
+  uint16_t block_stride = 1;
+  uint16_t repeat_stride = 8;
+
+  auto new_src0_ptr = src0->aligned + src0->offset;
+  auto new_src1_ptr = src1->aligned + src1->offset;
+  auto dst_ptr = dst->aligned + dst->offset;
+
+  // 设置向量掩码（处理边界）
+  INTRINSIC_NO_ARGS(set_mask_count);
+  const int64_t n = dst->sizes[0];
+  INTRINSIC(set_vector_mask, 0, n);
+
+  // 调用 vadd 指令
+  vector_eltwise_vadd_intrin<T>(
+      intrin_args<2, T>{dst_ptr,
+                        {new_src0_ptr, new_src1_ptr},
+                        0, // scalar (unused)
+                        1, // repeat = 1（单次执行，全部元素由 mask 覆盖）
+                        block_stride,
+                        {block_stride, block_stride},
+                        repeat_stride,
+                        {repeat_stride, repeat_stride}});
+
+  // 恢复掩码
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+// ============================================================================
+// MLIR 可调用的 C 接口（三种 dtype）
+//
+// Python 侧 symbol 命名规则:
+//   str(tl.int32)    → "int32"  → symbol = "custom_add_int32"
+//   str(tl.float32)  → "fp32"   → symbol = "custom_add_fp32"
+//   str(tl.float16)  → "fp16"   → symbol = "custom_add_fp16"
+//
+// MLIR 自动添加 _mlir_ciface_ 前缀:
+//   Python "custom_add_int32" → C _mlir_ciface_custom_add_int32
+//   Python "custom_add_fp32"  → C _mlir_ciface_custom_add_fp32
+//   Python "custom_add_fp16"  → C _mlir_ciface_custom_add_fp16
+// ============================================================================
+
+extern "C" {
+
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_add_int32(memref_t<__ubuf__ int32_t, 1> *src0,
+                              memref_t<__ubuf__ int32_t, 1> *src1,
+                              memref_t<__ubuf__ int32_t, 1> *dst) {
+  vadd_impl(src0, src1, dst);
+}
+
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_add_fp32(memref_t<__ubuf__ float, 1> *src0,
+                             memref_t<__ubuf__ float, 1> *src1,
+                             memref_t<__ubuf__ float, 1> *dst) {
+  vadd_impl(src0, src1, dst);
+}
+
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_add_fp16(memref_t<__ubuf__ __fp16, 1> *src0,
+                             memref_t<__ubuf__ __fp16, 1> *src1,
+                             memref_t<__ubuf__ __fp16, 1> *dst) {
+  vadd_impl(src0, src1, dst);
+}
+
+} // extern "C"
diff --git a/dlcompiler/bitcode/src/softmax_ops.cpp b/dlcompiler/bitcode/src/softmax_ops.cpp
new file mode 100644
index 00000000..248f47a4
--- /dev/null
+++ b/dlcompiler/bitcode/src/softmax_ops.cpp
@@ -0,0 +1,150 @@
+// ============================================================================
+// DSL Custom Op — softmax 相关运算 (fp32)
+//
+// 包含用于 FlashAttention 的 vector 操作:
+//   vexp:  y = exp(x)              单目
+//   vdiv:  z = x / y               双目
+//   vsub:  z = x - y               双目
+//   vmul:  z = x * y               双目
+//
+// 编译: bash compile.sh → softmax_ops.aiv.bc
+// 符号:
+//   _mlir_ciface_custom_vexp_fp32
+//   _mlir_ciface_custom_vdiv_fp32
+//   _mlir_ciface_custom_vsub_fp32
+//   _mlir_ciface_custom_vmul_fp32
+//
+// 共享位码库: ops/skills/triton/triton-dsl-custom-op/bitcode_lib/
+// ============================================================================
+
+#define __aiv__ [aicore]
+#define INTRINSIC_NO_ARGS(NAME) NAME()
+#define INTRINSIC(NAME, ...) NAME(__VA_ARGS__)
+
+template <typename T, size_t Dim> struct memref_t {
+  T *allocated;
+  T *aligned;
+  int64_t offset;
+  int64_t sizes[Dim];
+  int64_t strides[Dim];
+};
+
+// ============================================================================
+// 公共包装: 设置 mask → 调用 intrinsic → 恢复 mask
+// 所有操作都是连续访问: block_stride=1, repeat_stride=8
+// ============================================================================
+
+// 单目操作包装 (vexp)
+template <typename T>
+__aiv__ __attribute__((always_inline)) void
+unary_op_impl(void (*intrin)(__ubuf__ T *, __ubuf__ T *, uint64_t, uint16_t,
+                             uint16_t, uint16_t, uint16_t),
+              memref_t<__ubuf__ T, 1> *src, memref_t<__ubuf__ T, 1> *dst) {
+  auto s = src->aligned + src->offset;
+  auto d = dst->aligned + dst->offset;
+  int64_t n = dst->sizes[0];
+
+  INTRINSIC_NO_ARGS(set_mask_count);
+  INTRINSIC(set_vector_mask, 0, n);
+
+  // 单目: (dst, src, repeat, dst_bs, src_bs, dst_rs, src_rs)
+  intrin(d, s, 1, 1, 1, 8, 8);
+
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+// 双目操作包装 (vdiv, vsub, vmul)
+template <typename T>
+__aiv__ __attribute__((always_inline)) void
+binary_op_impl(void (*intrin)(__ubuf__ T *, __ubuf__ T *, __ubuf__ T *,
+                              uint64_t, uint16_t, uint16_t, uint16_t, uint16_t,
+                              uint16_t, uint16_t),
+               memref_t<__ubuf__ T, 1> *src0, memref_t<__ubuf__ T, 1> *src1,
+               memref_t<__ubuf__ T, 1> *dst) {
+  auto s0 = src0->aligned + src0->offset;
+  auto s1 = src1->aligned + src1->offset;
+  auto d = dst->aligned + dst->offset;
+  int64_t n = dst->sizes[0];
+
+  INTRINSIC_NO_ARGS(set_mask_count);
+  INTRINSIC(set_vector_mask, 0, n);
+
+  // 双目: (dst, src0, src1, repeat, dst_bs, src0_bs, src1_bs, dst_rs, src0_rs,
+  // src1_rs)
+  intrin(d, s0, s1, 1, 1, 1, 1, 8, 8, 8);
+
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+// ============================================================================
+// MLIR 可调用的 C 接口
+//
+// Python 侧 symbol 名:
+//   "custom_vexp_fp32"  → C _mlir_ciface_custom_vexp_fp32
+//   "custom_vdiv_fp32"  → C _mlir_ciface_custom_vdiv_fp32
+//   "custom_vsub_fp32"  → C _mlir_ciface_custom_vsub_fp32
+//   "custom_vmul_fp32"  → C _mlir_ciface_custom_vmul_fp32
+// ============================================================================
+
+extern "C" {
+
+// --- vexp: y = exp(x) ---
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_vexp_fp32(memref_t<__ubuf__ float, 1> *src,
+                              memref_t<__ubuf__ float, 1> *dst) {
+  // vexp(dst, src, repeat, dst_bs, src_bs, dst_rs, src_rs)
+  auto s = src->aligned + src->offset;
+  auto d = dst->aligned + dst->offset;
+  int64_t n = dst->sizes[0];
+  INTRINSIC_NO_ARGS(set_mask_count);
+  INTRINSIC(set_vector_mask, 0, n);
+  INTRINSIC(vexp, d, s, 1, 1, 1, 8, 8);
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+// --- vdiv: z = x / y ---
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_vdiv_fp32(memref_t<__ubuf__ float, 1> *src0,
+                              memref_t<__ubuf__ float, 1> *src1,
+                              memref_t<__ubuf__ float, 1> *dst) {
+  auto s0 = src0->aligned + src0->offset;
+  auto s1 = src1->aligned + src1->offset;
+  auto d = dst->aligned + dst->offset;
+  int64_t n = dst->sizes[0];
+  INTRINSIC_NO_ARGS(set_mask_count);
+  INTRINSIC(set_vector_mask, 0, n);
+  INTRINSIC(vdiv, d, s0, s1, 1, 1, 1, 1, 8, 8, 8);
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+// --- vsub: z = x - y ---
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_vsub_fp32(memref_t<__ubuf__ float, 1> *src0,
+                              memref_t<__ubuf__ float, 1> *src1,
+                              memref_t<__ubuf__ float, 1> *dst) {
+  auto s0 = src0->aligned + src0->offset;
+  auto s1 = src1->aligned + src1->offset;
+  auto d = dst->aligned + dst->offset;
+  int64_t n = dst->sizes[0];
+  INTRINSIC_NO_ARGS(set_mask_count);
+  INTRINSIC(set_vector_mask, 0, n);
+  INTRINSIC(vsub, d, s0, s1, 1, 1, 1, 1, 8, 8, 8);
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+// --- vmul: z = x * y ---
+__aiv__ __attribute__((always_inline)) void
+_mlir_ciface_custom_vmul_fp32(memref_t<__ubuf__ float, 1> *src0,
+                              memref_t<__ubuf__ float, 1> *src1,
+                              memref_t<__ubuf__ float, 1> *dst) {
+  auto s0 = src0->aligned + src0->offset;
+  auto s1 = src1->aligned + src1->offset;
+  auto d = dst->aligned + dst->offset;
+  int64_t n = dst->sizes[0];
+  INTRINSIC_NO_ARGS(set_mask_count);
+  INTRINSIC(set_vector_mask, 0, n);
+  INTRINSIC(vmul, d, s0, s1, 1, 1, 1, 1, 8, 8, 8);
+  INTRINSIC_NO_ARGS(set_mask_norm);
+}
+
+} // extern "C"
diff --git a/language/deeplink/bitcode/bc/.gitignore b/language/deeplink/bitcode/bc/.gitignore
new file mode 100644
index 00000000..512a2da7
--- /dev/null
+++ b/language/deeplink/bitcode/bc/.gitignore
@@ -0,0 +1,2 @@
+# Compiled bitcode binaries — generated by compile_bc.sh
+*.aiv.bc
diff --git a/language/deeplink/bitcode/bc/.gitkeep b/language/deeplink/bitcode/bc/.gitkeep
new file mode 100644
index 00000000..a78b62e1
--- /dev/null
+++ b/language/deeplink/bitcode/bc/.gitkeep
@@ -0,0 +1,3 @@
+# This directory holds compiled bitcode (.aiv.bc) files for dl.custom().
+# Generated by dlcompiler/bitcode/compile_bc.sh from ../src/.
+# Installed with the package — no extra setup.py changes needed.
diff --git a/language/deeplink/custom_op.py b/language/deeplink/custom_op.py
index 09073a1e..53ea97b8 100644
--- a/language/deeplink/custom_op.py
+++ b/language/deeplink/custom_op.py
@@ -167,9 +167,86 @@ def _add_bitcode_attr(op, builder, attrs):
         return
     from pathlib import Path
 
-    bitcode = Path(getattr(op, "bitcode"))
-    assert bitcode.exists(), f"Provided bitcode ({bitcode}) not exist"
-    attrs["bitcode"] = str(bitcode.absolute())
+    bitcode_path = _resolve_bitcode_path(getattr(op, "bitcode"))
+    attrs["bitcode"] = bitcode_path
+
+
+def _get_bitcode_search_paths():
+    """Return list of directories to search for bitcode files, ordered by priority."""
+    import os
+    from pathlib import Path
+
+    paths = []
+
+    # 1. Environment variable override (colon-separated)
+    env_path = os.environ.get("DLCOMPILER_BITCODE_PATH")
+    if env_path:
+        for p in env_path.split(":"):
+            p = p.strip()
+            if p and Path(p).is_dir():
+                paths.append(p)
+
+    # 2. Relative to this file: language/deeplink/bitcode/bc/
+    #    Works both in source tree and after pip install.
+    local_bc = Path(__file__).parent / "bitcode" / "bc"
+    if local_bc.is_dir():
+        paths.append(str(local_bc))
+
+    # 3. DLCOMPILER_SOURCE: project source root (set by set_env.sh)
+    #    Used when this module runs from site-packages but bitcode is in source tree.
+    source_root = os.environ.get("DLCOMPILER_SOURCE")
+    if source_root:
+        builtin_bc = Path(source_root) / "language" / "deeplink" / "bitcode" / "bc"
+        if builtin_bc.is_dir():
+            paths.append(str(builtin_bc))
+        # Also check legacy dlcompiler/ location
+        legacy_bc = Path(source_root) / "dlcompiler" / "bitcode" / "bc"
+        if legacy_bc.is_dir():
+            paths.append(str(legacy_bc))
+
+    # 4. Legacy dsl/ directory for backward compatibility
+    if source_root:
+        legacy_dsl = Path(source_root) / "dsl"
+        if legacy_dsl.is_dir():
+            paths.append(str(legacy_dsl))
+
+    return paths
+
+
+def _resolve_bitcode_path(bitcode_ref):
+    """Resolve a user-provided bitcode reference to an absolute file path.
+
+    Resolution logic:
+      1. If bitcode_ref is an existing file -> return its absolute path
+      2. If bitcode_ref is a name -> search for {name}.aiv.bc in search paths
+    """
+    from pathlib import Path
+
+    bitcode = Path(bitcode_ref)
+
+    # Case 1: Already a valid existing file path
+    if bitcode.is_file():
+        return str(bitcode.absolute())
+
+    # Case 2: Name resolution
+    if bitcode.suffix == ".bc":
+        search_name = bitcode.name
+    else:
+        search_name = f"{bitcode.name}.aiv.bc"
+
+    for search_dir in _get_bitcode_search_paths():
+        candidate = Path(search_dir) / search_name
+        if candidate.is_file():
+            return str(candidate.absolute())
+
+    searched_dirs = "\n  ".join(_get_bitcode_search_paths())
+    raise FileNotFoundError(
+        f"Cannot find bitcode file for '{bitcode_ref}'.\n"
+        f"  Searched for '{search_name}' in:\n"
+        f"  {searched_dirs}\n"
+        f"  Set DLCOMPILER_BITCODE_PATH to add custom search paths, "
+        f"or provide an absolute path to an existing .aiv.bc file."
+    )
 
 
 def _make_attrs(op, builder):
diff --git a/test/ascend/test_custom_op.py b/test/ascend/test_custom_op.py
index 3c3ee22b..8f411ee3 100644
--- a/test/ascend/test_custom_op.py
+++ b/test/ascend/test_custom_op.py
@@ -39,9 +39,9 @@ class add:
     def __init__(self, a, b, out=None):
         assert out, "out is required"
         self.symbol = "custom_add_" + str(a.dtype)
-        # self.bitcode defaults to the Ascend installation directory
-        # Typically it would be a specific bitcode file like /path/to/kernel.aiv.bc
-        self.bitcode = "/usr/local/Ascend/"
+        # bitcode name, auto-resolved via dlcompiler/bitcode/bc/ or
+        # DLCOMPILER_BITCODE_PATH environment variable
+        self.bitcode = "add"
 
 
 @triton.jit
diff --git a/test/ascend/test_softmax_custom_op.py b/test/ascend/test_softmax_custom_op.py
new file mode 100644
index 00000000..f674edf4
--- /dev/null
+++ b/test/ascend/test_softmax_custom_op.py
@@ -0,0 +1,227 @@
+"""
+Triton softmax kernel using dl.custom() with bitcode auto-resolution.
+
+Computes: softmax(x_i) = exp(x_i - max(x)) / sum(exp(x_j - max(x)))
+
+The three vector ops (vsub, vexp, vdiv) are dl.custom() calls that link
+against softmax_ops.aiv.bc.  The bitcode path is auto-resolved from the
+name "softmax_ops" — no absolute path needed.
+"""
+
+import os
+
+# Ensure bishengir tools are in PATH before triton imports read BISHENG_INSTALL_PATH.
+_BISHENG_INSTALL = (
+    "/mnt/data01/zmz/workspace/04ttshared/fordlc/ascendnpu-ir-0514/build/install/bin/"
+)
+if os.path.isdir(_BISHENG_INSTALL):
+    os.environ.setdefault("BISHENG_INSTALL_PATH", _BISHENG_INSTALL)
+    if _BISHENG_INSTALL not in os.environ.get("PATH", ""):
+        os.environ["PATH"] = _BISHENG_INSTALL + os.pathsep + os.environ.get("PATH", "")
+
+import torch
+import triton
+import triton.language as tl
+import triton.language.extra.deeplink as dl
+
+# ======================================================================
+# DSL custom op registration — bitcode auto-resolved by name
+# ======================================================================
+
+
+@dl.register_custom_op
+class vsub_fp32:
+    core = dl.CORE.VECTOR
+    pipe = dl.PIPE.PIPE_V
+    mode = dl.MODE.SIMD
+
+    def __init__(self, a, b, out=None):
+        assert out is not None, "dl.custom() requires out= parameter"
+        self.symbol = "custom_vsub_fp32"
+        self.bitcode = "softmax_ops"  # auto-resolved to softmax_ops.aiv.bc
+
+
+@dl.register_custom_op
+class vexp_fp32:
+    core = dl.CORE.VECTOR
+    pipe = dl.PIPE.PIPE_V
+    mode = dl.MODE.SIMD
+
+    def __init__(self, a, out=None):
+        assert out is not None, "dl.custom() requires out= parameter"
+        self.symbol = "custom_vexp_fp32"
+        self.bitcode = "softmax_ops"
+
+
+@dl.register_custom_op
+class vdiv_fp32:
+    core = dl.CORE.VECTOR
+    pipe = dl.PIPE.PIPE_V
+    mode = dl.MODE.SIMD
+
+    def __init__(self, a, b, out=None):
+        assert out is not None, "dl.custom() requires out= parameter"
+        self.symbol = "custom_vdiv_fp32"
+        self.bitcode = "softmax_ops"
+
+
+# ======================================================================
+# Triton kernel
+# ======================================================================
+
+CHUNK_SIZE = 1024  # max vector length for fp32 DSL ops
+MIN_CHUNK_SIZE = 8  # min vector length (SIMD width >= 2)
+
+
+@triton.jit
+def softmax_kernel(
+    x_ptr,
+    output_ptr,
+    N,
+    row_stride,
+    BLOCK_SIZE: tl.constexpr,
+    CHUNK_SIZE: tl.constexpr,
+):
+    """Row-wise fused softmax using dl.custom() on Ascend NPU.
+
+    One program = one row.  Processing in chunks of CHUNK_SIZE.
+    Reductions (tl.max, tl.sum) use native Triton.
+    """
+    pid = tl.program_id(axis=0)
+    row_start = pid * row_stride
+
+    # --- Pass 1: row-wise maximum ---
+    row_max = float("-inf")
+    for start in tl.static_range(0, BLOCK_SIZE, CHUNK_SIZE):
+        offsets = row_start + start + tl.arange(0, CHUNK_SIZE)
+        mask = (start + tl.arange(0, CHUNK_SIZE)) < N
+        x_chunk = tl.load(x_ptr + offsets, mask=mask, other=float("-inf"))
+        row_max = tl.maximum(row_max, tl.max(x_chunk))
+
+    # --- Pass 2: denominator sum(exp(x - max)) ---
+    denom = 0.0
+    for start in tl.static_range(0, BLOCK_SIZE, CHUNK_SIZE):
+        offsets = row_start + start + tl.arange(0, CHUNK_SIZE)
+        mask = (start + tl.arange(0, CHUNK_SIZE)) < N
+        x_chunk = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+        x_f32 = x_chunk.to(tl.float32)
+
+        zero = tl.full([CHUNK_SIZE], 0, tl.float32)
+        row_max_vec = zero + row_max
+
+        # vsub: shifted = x - max
+        buf_sub = tl.full([CHUNK_SIZE], 0, tl.float32)
+        shifted = dl.custom("vsub_fp32", x_f32, row_max_vec, out=buf_sub)
+
+        # vexp: exp_vals = exp(shifted)
+        buf_exp = tl.full([CHUNK_SIZE], 0, tl.float32)
+        exp_vals = dl.custom("vexp_fp32", shifted, out=buf_exp)
+
+        denom += tl.sum(tl.where(mask, exp_vals, 0.0))
+
+    # --- Pass 3: normalize and store ---
+    for start in tl.static_range(0, BLOCK_SIZE, CHUNK_SIZE):
+        offsets = row_start + start + tl.arange(0, CHUNK_SIZE)
+        mask = (start + tl.arange(0, CHUNK_SIZE)) < N
+        x_chunk = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+        x_f32 = x_chunk.to(tl.float32)
+
+        zero = tl.full([CHUNK_SIZE], 0, tl.float32)
+        row_max_vec = zero + row_max
+        denom_vec = zero + denom
+
+        # vsub: shifted = x - max
+        buf_sub = tl.full([CHUNK_SIZE], 0, tl.float32)
+        shifted = dl.custom("vsub_fp32", x_f32, row_max_vec, out=buf_sub)
+
+        # vexp: exp_vals = exp(shifted)
+        buf_exp = tl.full([CHUNK_SIZE], 0, tl.float32)
+        exp_vals = dl.custom("vexp_fp32", shifted, out=buf_exp)
+
+        # vdiv: result = exp_vals / denom
+        buf_div = tl.full([CHUNK_SIZE], 0, tl.float32)
+        result = dl.custom("vdiv_fp32", exp_vals, denom_vec, out=buf_div)
+
+        tl.store(output_ptr + offsets, result.to(x_chunk.dtype), mask=mask)
+
+
+# ======================================================================
+# Public wrapper
+# ======================================================================
+
+
+def softmax(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    """Softmax using dl.custom() Triton kernel.
+
+    Args:
+        x: Input tensor (fp16, fp32, or bf16).
+        dim: Dimension for softmax (default: -1).
+
+    Returns:
+        Output tensor with same shape/dtype as x.
+    """
+    ndim = x.ndim
+    if dim < 0:
+        dim = ndim + dim
+
+    # Handle non-last-dim by transposition
+    permuted = False
+    inv_perm = list(range(ndim))
+    if dim != ndim - 1:
+        perm = list(range(ndim))
+        perm[-1], perm[dim] = perm[dim], perm[-1]
+        x = x.permute(perm).contiguous()
+        dim = ndim - 1
+        permuted = True
+        inv_perm = [0] * ndim
+        for i, p in enumerate(perm):
+            inv_perm[p] = i
+    else:
+        x = x.contiguous()
+
+    N = x.shape[-1]
+    M = x.numel() // N
+
+    BLOCK_SIZE = triton.next_power_of_2(N)
+    CS = max(MIN_CHUNK_SIZE, min(CHUNK_SIZE, BLOCK_SIZE))
+
+    output = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+    grid = (M,)
+    softmax_kernel[grid](
+        x,
+        output,
+        N,
+        N,
+        BLOCK_SIZE=BLOCK_SIZE,
+        CHUNK_SIZE=CS,
+    )
+
+    if permuted:
+        output = output.permute(inv_perm)
+
+    return output
+
+
+# ======================================================================
+# Test
+# ======================================================================
+
+if __name__ == "__main__":
+    print("=== Softmax Triton Kernel Test (dl.custom) ===")
+
+    for shape, dim in [
+        ((4, 128), -1),
+        ((4, 1024), -1),
+        ((2, 4096), -1),
+        ((16, 256), 0),
+    ]:
+        x = torch.randn(shape, dtype=torch.float32, device="npu")
+        y = softmax(x, dim=dim)
+        ref = torch.nn.functional.softmax(x.float(), dim=dim)
+
+        max_diff = (y - ref).abs().max().item()
+        passed = torch.allclose(y, ref, atol=1e-5)
+        status = "PASS" if passed else "FAIL"
+        print(f"  [{status}] shape={shape}, dim={dim}, max_diff={max_diff:.2e}")
+
+    print("Done.")