diff --git a/build/cam/build.sh b/build/cam/build.sh
old mode 100644
new mode 100755
diff --git a/build/cam/comm_operator/build.sh b/build/cam/comm_operator/build.sh
old mode 100644
new mode 100755
diff --git a/build/cam/comm_operator/build_pybind.sh b/build/cam/comm_operator/build_pybind.sh
index c0c9593d78b38b89b8f79445f0afb4ce1881c850..8ef99e3fb4a62f01ead9a5304705a90a68de0297 100644
--- a/build/cam/comm_operator/build_pybind.sh
+++ b/build/cam/comm_operator/build_pybind.sh
@@ -7,7 +7,7 @@
 # History: 2025-12-09 create pybind building script
 set -e
 
-EXT_PATH=$MODULE_BUILD_PATH/pybind/
+EXT_PATH=$MODULE_BUILD_PATH/pybind
 DIST_OUT_PATH=$MODULE_BUILD_OUT_PATH
 
 if [ ! -d "$MODULE_BUILD_PATH" ]; then
@@ -22,7 +22,7 @@ build_pybind() {
     cp -rf $MODULE_SRC_PATH/pybind/pytorch_extension $BUILD_PATH
     cd $EXT_PATH
     python3 setup.py bdist_wheel
-    DIST_GEN_PATH=$EXT_PATH/dist/
+    DIST_GEN_PATH=$EXT_PATH/dist
     if [ -d "$DIST_GEN_PATH" ]; then
         echo "copy $DIST_GEN_PATH to $DIST_OUT_PATH/"
         cp -rf $DIST_GEN_PATH $DIST_OUT_PATH
diff --git a/src/cam/comm_operator/pybind/functions.h b/src/cam/comm_operator/pybind/functions.h
index 305a8f49b5a612ca6036f6be835311cf9c515fcc..8f583349f9d1b851242ff3047aec8d22e22d69ae 100644
--- a/src/cam/comm_operator/pybind/functions.h
+++ b/src/cam/comm_operator/pybind/functions.h
@@ -33,4 +33,38 @@ at::Tensor fused_deep_moe_impl_autograd(
     int64_t shareExpertRankNum, \
     int64_t quantMode, \
     int64_t globalBs);
+
+std::vector<at::Tensor>
+moe_dispatch_normal_impl_autograd(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs);
+
+at::Tensor
+moe_combine_normal_impl_autograd(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const std::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs);
 #endif // COMMON_OPS_CSRC_FUNCTIONS_H_
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/fused_deep_moe.cpp b/src/cam/comm_operator/pybind/fused_deep_moe.cpp
index b1103453b8036f987279e390da3b7f892145345d..8ea3acb61ecf4b1601bad15fe4b6f08e08ed88bc 100644
--- a/src/cam/comm_operator/pybind/fused_deep_moe.cpp
+++ b/src/cam/comm_operator/pybind/fused_deep_moe.cpp
@@ -65,7 +65,7 @@ at::Tensor fused_deep_moe_impl_npu(
     return output;
 }
 
-std::tuple<at::Tensor, at::Tensor> fused_deep_moe_backward_impl_npu(const at::Tensor &self)
+tensor_list fused_deep_moe_backward_impl_npu(const at::Tensor &self)
 {
     at::Tensor result = at::Tensor(self); // allocate output memory
     return {result, result};
diff --git a/src/cam/comm_operator/pybind/moe_combine_normal.cpp b/src/cam/comm_operator/pybind/moe_combine_normal.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..570e9e0c4d6c161deaf0efe13cd41e2125c69671
--- /dev/null
+++ b/src/cam/comm_operator/pybind/moe_combine_normal.cpp
@@ -0,0 +1,184 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * Description: add moe_combine_normal pybind extension file
+ * Create: 2025-12-10
+ * Note:
+ * History: 2025-12-10 create moe_combine_normal pybind extension file
+ */
+
+#include <torch/extension.h>
+#include <torch/library.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "pytorch_npu_helper.hpp"
+#include <vector>
+#include <tuple>
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+using namespace std;
+
+constexpr int KERNEL_PARAM_CNT = 3;
+
+at::Tensor moe_combine_normal_impl_npu(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    std::vector<at::Tensor> weightBlocks;
+    if (recvTopkWeights.size(0) != 0) {
+        weightBlocks.emplace_back(recvTopkWeights);
+    }
+    at::Tensor expertScales = torch::cat(weightBlocks, 0);
+
+    // Combine data
+    auto combinedX = torch::empty({expertScales.size(0), recvX.size(1)}, recvX.options());
+
+    EXEC_NPU_CMD(aclnnMoeCombineNormal,
+        // input
+        recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+        // attr
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs, \
+        // output
+        combinedX);
+    return combinedX;
+}
+
+tensor_list moe_combine_normal_backward_impl_npu(const at::Tensor &self)
+{
+    return {at::Tensor(), at::Tensor(), at::Tensor()};
+}
+
+at::Tensor moe_combine_normal_impl_meta(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    std::vector<at::Tensor> weightBlocks;
+    if (recvTopkWeights.size(0) != 0) {
+        weightBlocks.emplace_back(recvTopkWeights);
+    }
+    at::Tensor expertScales = torch::cat(weightBlocks, 0);
+
+    // Combine data
+    auto combinedX = torch::empty({expertScales.size(0), recvX.size(1)}, recvX.options());
+
+    return combinedX;
+}
+
+at::Tensor moe_combine_normal_impl(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    static auto op = torch::Dispatcher::singleton()
+        .findSchemaOrThrow("umdk_cam_op_lib::moe_combine_normal", "")
+        .typed<decltype(moe_combine_normal_impl_npu)>();
+    return op.call(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+}
+
+// bind forward and backward by inheriting from torch::autograd::Function
+class ExtMoeCombineNormal : public torch::autograd::Function<ExtMoeCombineNormal> {
+public:
+    static at::Tensor forward(
+        AutogradContext *ctx, \
+        const at::Tensor &recvX, \
+        const at::Tensor &tokenSrcInfo, \
+        const at::Tensor &epRecvCounts, \
+        const at::Tensor &recvTopkWeights, \
+        const c10::optional<at::Tensor> &tpRecvCounts, \
+        c10::string_view epGroupName, \
+        int64_t epWorldSize, \
+        int64_t epRankId, \
+        c10::string_view tpGroupName, \
+        int64_t tpWorldSize, \
+        int64_t tpRankId, \
+        int64_t moeExpertNum, \
+        int64_t globalBs)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        auto result = moe_combine_normal_impl(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+            epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+        return result;
+    }
+
+    static tensor_list backward(
+        AutogradContext *ctx, \
+        tensor_list grad_outputs)
+    {
+        return {at::Tensor(), at::Tensor(), at::Tensor()};
+    }
+};
+
+at::Tensor moe_combine_normal_impl_autograd(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    auto result = ExtMoeCombineNormal::apply(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+    return result;
+}
+
+// moe_combine_normal
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, PrivateUse1, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_npu);
+    m.impl("moe_combine_normal_backward", &moe_combine_normal_backward_impl_npu);
+}
+
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, AutogradPrivateUse1, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_autograd);
+}
+
+// register forward/backward implementations for the Meta device
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, Meta, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_meta);
+}
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp b/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..31a2987f28fa388d4926e940b2e825e43dea7d2c
--- /dev/null
+++ b/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
@@ -0,0 +1,215 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * Description: add moe_dispatch_normal pybind extension file
+ * Create: 2025-12-10
+ * Note:
+ * History: 2025-12-10 create moe_dispatch_normal pybind extension file
+ */
+
+#include <torch/extension.h>
+#include <torch/library.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "pytorch_npu_helper.hpp"
+#include <vector>
+#include <tuple>
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+using namespace std;
+
+constexpr int KERNEL_PARAM_CNT = 3;
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_npu(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs
+)
+{
+    // argument order must match fused_deep_moe.cpp: inputs first, then attrs, then outputs
+    vector<char> groupEpChrs(groupEp.begin(), groupEp.end());
+    groupEpChrs.push_back('\0');
+    char *groupEpPtr = &groupEpChrs[0];
+    vector<char> groupTpChrs(groupTp.begin(), groupTp.end());
+    groupTpChrs.push_back('\0');
+    char *groupTpPtr = &groupTpChrs[0];
+
+    auto recvCountCpu = recvCount.to(at::kCPU);
+    auto recvCountPtr = recvCountCpu.data_ptr<int64_t>();
+    auto hidden = static_cast<int64_t>(x.size(1));
+    int64_t totalRecvTokens = recvCountPtr[moeExpertNum - 1];
+    int totalCnt = totalRecvTokens == 0 ? 1 : totalRecvTokens;
+    auto expandxOut = at::zeros({totalCnt, hidden}, x.options());
+    auto dynamicScalesOut = at::zeros({totalCnt}, at::dtype(at::kFloat).device(x.device()));
+    auto expandIdxOut = at::zeros({totalCnt * KERNEL_PARAM_CNT}, at::dtype(at::kInt).device(x.device()));
+
+    EXEC_NPU_CMD(aclnnMoeDispatchNormal,
+        // input
+        x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount, \
+        // attr
+        groupEpPtr, epWorldSize, epRankId, groupTpPtr, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs, \
+        // output
+        expandxOut, dynamicScalesOut, expandIdxOut);
+    return std::make_tuple(expandxOut, dynamicScalesOut, expandIdxOut);
+}
+
+tensor_list moe_dispatch_normal_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self); // allocate output memory
+    return {result, result, result};
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_meta(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs)
+{
+    // argument order must match fused_deep_moe.cpp: inputs first, then attrs, then outputs
+    vector<char> groupEpChrs(groupEp.begin(), groupEp.end());
+    groupEpChrs.push_back('\0');
+    char *groupEpPtr = &groupEpChrs[0];
+    vector<char> groupTpChrs(groupTp.begin(), groupTp.end());
+    groupTpChrs.push_back('\0');
+    char *groupTpPtr = &groupTpChrs[0];
+
+    auto recvCountCpu = recvCount.to(at::kCPU);
+    auto recvCountPtr = recvCountCpu.data_ptr<int64_t>();
+    auto hidden = static_cast<int64_t>(x.size(1));
+    int64_t totalRecvTokens = recvCountPtr[moeExpertNum - 1];
+    int totalCnt = totalRecvTokens == 0 ? 1 : totalRecvTokens;
+    auto expandxOut = at::zeros({totalCnt, hidden}, x.options());
+    auto dynamicScalesOut = at::zeros({totalCnt}, at::dtype(at::kFloat).device(x.device()));
+    auto expandIdxOut = at::zeros({totalCnt * KERNEL_PARAM_CNT}, at::dtype(at::kInt).device(x.device()));
+
+    return std::make_tuple(expandxOut, dynamicScalesOut, expandIdxOut);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs)
+{
+    static auto op = torch::Dispatcher::singleton()
+        .findSchemaOrThrow("umdk_cam_op_lib::moe_dispatch_normal", "")
+        .typed<decltype(moe_dispatch_normal_impl_npu)>();
+    return op.call(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount, groupEp, epWorldSize, epRankId, \
+        groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+}
+
+// bind forward and backward by inheriting from torch::autograd::Function
+class ExtMoeDispatchNormal : public torch::autograd::Function<ExtMoeDispatchNormal> {
+public:
+    static tensor_list forward(
+        AutogradContext *ctx, \
+        const at::Tensor &x, \
+        const at::Tensor &topkIdx, \
+        const at::Tensor &sendOffset, \
+        const at::Tensor &sendTokenIdx, \
+        const at::Tensor &recvOffset, \
+        const at::Tensor &recvCount, \
+        c10::string_view groupEp, \
+        int64_t epWorldSize, \
+        int64_t epRankId, \
+        c10::string_view groupTp, \
+        int64_t tpWorldSize, \
+        int64_t tpRankId, \
+        int64_t moeExpertNum, \
+        int64_t quantMode, \
+        int64_t globalBs)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        auto result = moe_dispatch_normal_impl(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, \
+            recvCount, groupEp, epWorldSize, epRankId, \
+            groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+
+        return {std::get<0>(result), std::get<1>(result), std::get<2>(result)};
+    }
+
+    static tensor_list backward(
+        AutogradContext *ctx, \
+        tensor_list grad_outputs)
+    {
+        return {at::Tensor(), at::Tensor(), at::Tensor()};
+    }
+};
+
+tensor_list moe_dispatch_normal_impl_autograd(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs)
+{
+    auto result = ExtMoeDispatchNormal::apply(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, \
+        recvCount, groupEp, epWorldSize, epRankId, \
+        groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+    return result;
+}
+
+// moe_dispatch_normal
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, PrivateUse1, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_npu);
+    m.impl("moe_dispatch_normal_backward", &moe_dispatch_normal_backward_impl_npu);
+}
+
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, AutogradPrivateUse1, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_autograd);
+}
+
+// register forward/backward implementations for the Meta device
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, Meta, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_meta);
+}
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/pybind.cpp b/src/cam/comm_operator/pybind/pybind.cpp
index a8282f158d414cece867f93f0c3a523e4a6fe259..65a40671265720b01ff1cc1902a9354b86c063d5 100644
--- a/src/cam/comm_operator/pybind/pybind.cpp
+++ b/src/cam/comm_operator/pybind/pybind.cpp
@@ -13,6 +13,8 @@
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
     m.def("fused_deep_moe", &fused_deep_moe_impl_autograd, "fused_deep_moe");
+    m.def("moe_dispatch_normal", &moe_dispatch_normal_impl_autograd, "moe_dispatch_normal");
+    m.def("moe_combine_normal", &moe_combine_normal_impl_autograd, "moe_combine_normal");
 }
 
 TORCH_LIBRARY(umdk_cam_op_lib, m) {
@@ -20,4 +22,10 @@ TORCH_LIBRARY(umdk_cam_op_lib, m) {
     m.def("fused_deep_moe(Tensor x, Tensor expertIds, Tensor gmm1PermutedWeight, Tensor gmm1PermutedWeightScale, \
         Tensor gmm2Weight, Tensor gmm2WeightScale, Tensor expertSmoothScalesOptional, Tensor expertScalesOptional, \
         str groupEp, int epRankSize, int epRankId, int moeExpertNum, int shareExpertNum, int shareExpertRankNum, \
         int quantMode, int globalBs) -> Tensor");
+    m.def("moe_dispatch_normal(Tensor x, Tensor topkIdx, Tensor sendOffset, Tensor sendTokenIdx, Tensor recvOffset, \
+        Tensor recvCount, str groupEp, int epWorldSize, int epRankId, str groupTp, int tpWorldSize, int tpRankId, \
+        int moeExpertNum, int quantMode, int globalBs) -> (Tensor, Tensor, Tensor)");
+    m.def("moe_combine_normal(Tensor recvX, Tensor tokenSrcInfo, Tensor epRecvCounts, Tensor recvTopkWeights, \
+        Tensor? tpRecvCounts, str epGroupName, int epWorldSize, int epRankId, str tpGroupName, int tpWorldSize, \
+        int tpRankId, int moeExpertNum, int globalBs) -> Tensor");
 }
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/setup.py b/src/cam/comm_operator/pybind/setup.py
index 3e6e00339f3badf17dc553819ee352d56651b7c0..dd46a73c96da0cf6129982e1d96500f4925b80d2 100644
--- a/src/cam/comm_operator/pybind/setup.py
+++ b/src/cam/comm_operator/pybind/setup.py
@@ -75,6 +75,8 @@ ext1 = NpuExtension(
               'gcov', 'runtime', 'torch', 'ascendcl', 'profapi', 'opapi', 'cust_opapi'],
     sources=["./fused_deep_moe.cpp",
+             "./moe_dispatch_normal.cpp",
+             "./moe_combine_normal.cpp",
              "./pybind.cpp",
              ],
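
For reference, the sketch below shows how the two ops added by this change could be exercised from Python once the wheel produced by build/cam/comm_operator/build_pybind.sh is installed. Only the torch.ops namespace (umdk_cam_op_lib), the op names, and the argument order come from the schemas registered in pybind.cpp; the extension import name (umdk_cam_op_ext), the HCCL group names, and every shape, dtype, and attribute value are illustrative assumptions, not values taken from this change.

    # usage_sketch.py -- illustrative only; assumes an initialized NPU device and HCCL groups
    import torch
    import torch_npu            # brings up the NPU (PrivateUse1) backend
    import umdk_cam_op_ext      # assumed module name; importing it runs the TORCH_LIBRARY registration

    # The schemas registered in pybind.cpp become reachable through torch.ops once the module is loaded.
    dispatch_op = torch.ops.umdk_cam_op_lib.moe_dispatch_normal
    combine_op = torch.ops.umdk_cam_op_lib.moe_combine_normal
    print(dispatch_op, combine_op)

    # Argument order follows the registered schemas; all values below are placeholders and the
    # calls are left commented out because they require live EP/TP communication groups.
    # expand_x, dynamic_scales, expand_idx = dispatch_op(
    #     x, topk_idx, send_offset, send_token_idx, recv_offset, recv_count,
    #     "ep_group_name", 16, 0, "tp_group_name", 1, 0, 64, 0, 8)
    # combined_x = combine_op(
    #     recv_x, token_src_info, ep_recv_counts, recv_topk_weights, tp_recv_counts,
    #     "ep_group_name", 16, 0, "tp_group_name", 1, 0, 64, 8)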