From 5415f56fdccf7f84b63c98658dd4b625b6b46bec Mon Sep 17 00:00:00 2001
From: lee-yuxing
Date: Fri, 12 Dec 2025 17:03:35 +0800
Subject: [PATCH] cam: fix pybind normal dispatch & combine whl packet compile issue

---
 build/cam/build.sh                            |   0
 build/cam/comm_operator/build.sh              |   0
 build/cam/comm_operator/build_pybind.sh       |   4 +-
 src/cam/comm_operator/pybind/functions.h      |  34 +++
 .../comm_operator/pybind/fused_deep_moe.cpp   |   2 +-
 .../pybind/moe_combine_normal.cpp             | 184 +++++++++++++++
 .../pybind/moe_dispatch_normal.cpp            | 215 ++++++++++++++++++
 src/cam/comm_operator/pybind/pybind.cpp       |   8 +
 src/cam/comm_operator/pybind/setup.py         |   2 +
 9 files changed, 446 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 build/cam/build.sh
 mode change 100644 => 100755 build/cam/comm_operator/build.sh
 create mode 100644 src/cam/comm_operator/pybind/moe_combine_normal.cpp
 create mode 100644 src/cam/comm_operator/pybind/moe_dispatch_normal.cpp

diff --git a/build/cam/build.sh b/build/cam/build.sh
old mode 100644
new mode 100755
diff --git a/build/cam/comm_operator/build.sh b/build/cam/comm_operator/build.sh
old mode 100644
new mode 100755
diff --git a/build/cam/comm_operator/build_pybind.sh b/build/cam/comm_operator/build_pybind.sh
index c0c9593d..8ef99e3f 100644
--- a/build/cam/comm_operator/build_pybind.sh
+++ b/build/cam/comm_operator/build_pybind.sh
@@ -7,7 +7,7 @@
 # History: 2025-12-09 create pybind building script
 
 set -e
-EXT_PATH=$MODULE_BUILD_PATH/pybind/
+EXT_PATH=$MODULE_BUILD_PATH/pybind
 DIST_OUT_PATH=$MODULE_BUILD_OUT_PATH
 
 if [ ! -d "$MODULE_BUILD_PATH" ]; then
@@ -22,7 +22,7 @@ build_pybind() {
     cp -rf $MODULE_SRC_PATH/pybind/pytorch_extension $BUILD_PATH
     cd $EXT_PATH
     python3 setup.py bdist_wheel
-    DIST_GEN_PATH=$EXT_PATH/dist/
+    DIST_GEN_PATH=$EXT_PATH/dist
     if [ -d "$DIST_GEN_PATH" ]; then
         echo "copy $DIST_GEN_PATH to $DIST_OUT_PATH/"
         cp -rf $DIST_GEN_PATH $DIST_OUT_PATH
diff --git a/src/cam/comm_operator/pybind/functions.h b/src/cam/comm_operator/pybind/functions.h
index 305a8f49..8f583349 100644
--- a/src/cam/comm_operator/pybind/functions.h
+++ b/src/cam/comm_operator/pybind/functions.h
@@ -33,4 +33,38 @@ at::Tensor fused_deep_moe_impl_autograd(
     int64_t shareExpertRankNum, \
     int64_t quantMode, \
     int64_t globalBs);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor>  // three outputs, as in the "-> (Tensor, Tensor, Tensor)" schema
+moe_dispatch_normal_impl_autograd(  // defined in moe_dispatch_normal.cpp
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs);
+
+at::Tensor  // tpRecvCounts below is c10::optional, the type used by the definitions in moe_combine_normal.cpp
+moe_combine_normal_impl_autograd(  // defined in moe_combine_normal.cpp
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs);
 #endif // COMMON_OPS_CSRC_FUNCTIONS_H_
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/fused_deep_moe.cpp b/src/cam/comm_operator/pybind/fused_deep_moe.cpp
index b1103453..8ea3acb6 100644
--- a/src/cam/comm_operator/pybind/fused_deep_moe.cpp
+++ b/src/cam/comm_operator/pybind/fused_deep_moe.cpp
@@ -65,7 +65,7 @@ at::Tensor fused_deep_moe_impl_npu(
     return output;
 }
 
-std::tuple<at::Tensor, at::Tensor> fused_deep_moe_backward_impl_npu(const at::Tensor &self)
+tensor_list fused_deep_moe_backward_impl_npu(const at::Tensor &self)
 {
     at::Tensor result = at::Tensor(self); // 创建输出内存
     return {result, result};
diff --git a/src/cam/comm_operator/pybind/moe_combine_normal.cpp b/src/cam/comm_operator/pybind/moe_combine_normal.cpp
new file mode 100644
index 00000000..570e9e0c
--- /dev/null
+++ b/src/cam/comm_operator/pybind/moe_combine_normal.cpp
@@ -0,0 +1,199 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * Description: add moe_combine_normal pybind extension file
+ * Create: 2025-12-10
+ * Note:
+ * History: 2025-12-10 create moe_combine_normal pybind extension file
+ */
+
+// NOTE(review): the angle-bracket header names were lost when this patch was extracted; the
+// system headers below are reconstructed from what this file uses. Confirm against the commit.
+#include <cstddef>
+#include <vector>
+#include <torch/extension.h>
+#include <torch/library.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "pytorch_npu_helper.hpp"
+#include <torch/csrc/autograd/custom_function.h>
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+using namespace std;
+
+namespace {
+// ExtMoeCombineNormal::forward takes this many arguments after the AutogradContext; autograd
+// expects backward to return one gradient slot for each of them.
+constexpr size_t COMBINE_FORWARD_INPUT_CNT = 13;
+
+// Copies a group name into an owned, NUL-terminated buffer so that a plain char * can be passed
+// to the kernel launcher, the same way moe_dispatch_normal.cpp passes its group names.
+std::vector<char> ToCStringBuffer(c10::string_view name)
+{
+    std::vector<char> buffer(name.begin(), name.end());
+    buffer.push_back('\0');
+    return buffer;
+}
+} // namespace
+
+// NPU (PrivateUse1) kernel for moe_combine_normal.
+// Allocates an uninitialised [recvTopkWeights.size(0), recvX.size(1)] tensor with recvX's options,
+// launches aclnnMoeCombineNormal with it as the output and returns it.
+at::Tensor moe_combine_normal_impl_npu(
+    const at::Tensor &recvX,
+    const at::Tensor &tokenSrcInfo,
+    const at::Tensor &epRecvCounts,
+    const at::Tensor &recvTopkWeights,
+    const c10::optional<at::Tensor> &tpRecvCounts,
+    c10::string_view epGroupName,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view tpGroupName,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t globalBs)
+{
+    // EXEC_NPU_CMD argument order: inputs first, then attrs, then outputs.
+    std::vector<char> epGroupChrs = ToCStringBuffer(epGroupName);
+    std::vector<char> tpGroupChrs = ToCStringBuffer(tpGroupName);
+    char *epGroupPtr = epGroupChrs.data();
+    char *tpGroupPtr = tpGroupChrs.data();
+
+    // One output row per row of recvTopkWeights. The row count is read directly instead of going
+    // through torch::cat on a list that is empty when there are no rows, which would throw.
+    auto combinedX = torch::empty({recvTopkWeights.size(0), recvX.size(1)}, recvX.options());
+
+    EXEC_NPU_CMD(aclnnMoeCombineNormal,
+        // input
+        recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts,
+        // attr
+        epGroupPtr, epWorldSize, epRankId, tpGroupPtr, tpWorldSize, tpRankId, moeExpertNum, globalBs,
+        // output
+        combinedX);
+    return combinedX;
+}
+
+// Kernel registered under "moe_combine_normal_backward"; ignores its argument and returns three
+// undefined tensors.
+tensor_list moe_combine_normal_backward_impl_npu(const at::Tensor &self)
+{
+    return {at::Tensor(), at::Tensor(), at::Tensor()};
+}
+
+// Meta-device kernel: returns a tensor with the same shape and options as the NPU kernel's
+// output without launching anything.
+at::Tensor moe_combine_normal_impl_meta(
+    const at::Tensor &recvX,
+    const at::Tensor &tokenSrcInfo,
+    const at::Tensor &epRecvCounts,
+    const at::Tensor &recvTopkWeights,
+    const c10::optional<at::Tensor> &tpRecvCounts,
+    c10::string_view epGroupName,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view tpGroupName,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t globalBs)
+{
+    return torch::empty({recvTopkWeights.size(0), recvX.size(1)}, recvX.options());
+}
+
+// Looks up umdk_cam_op_lib::moe_combine_normal in the dispatcher and calls it, so the kernel
+// registered for the active dispatch key runs.
+at::Tensor moe_combine_normal_impl(
+    const at::Tensor &recvX,
+    const at::Tensor &tokenSrcInfo,
+    const at::Tensor &epRecvCounts,
+    const at::Tensor &recvTopkWeights,
+    const c10::optional<at::Tensor> &tpRecvCounts,
+    c10::string_view epGroupName,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view tpGroupName,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t globalBs)
+{
+    static auto op = torch::Dispatcher::singleton()
+        .findSchemaOrThrow("umdk_cam_op_lib::moe_combine_normal", "")
+        .typed<decltype(moe_combine_normal_impl)>();
+    return op.call(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts,
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+}
+
+// Binds forward and backward by inheriting from torch::autograd::Function.
+class ExtMoeCombineNormal : public torch::autograd::Function<ExtMoeCombineNormal> {
+public:
+    static at::Tensor forward(
+        AutogradContext *ctx,
+        const at::Tensor &recvX,
+        const at::Tensor &tokenSrcInfo,
+        const at::Tensor &epRecvCounts,
+        const at::Tensor &recvTopkWeights,
+        const c10::optional<at::Tensor> &tpRecvCounts,
+        c10::string_view epGroupName,
+        int64_t epWorldSize,
+        int64_t epRankId,
+        c10::string_view tpGroupName,
+        int64_t tpWorldSize,
+        int64_t tpRankId,
+        int64_t moeExpertNum,
+        int64_t globalBs)
+    {
+        // Exclude the autograd keys so the dispatcher call below reaches the backend kernel.
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return moe_combine_normal_impl(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts,
+            epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+    }
+
+    // No gradient is produced: one undefined tensor is returned per forward argument.
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        return tensor_list(COMBINE_FORWARD_INPUT_CNT);
+    }
+};
+
+// Entry point registered for AutogradPrivateUse1 and bound in pybind.cpp; runs ExtMoeCombineNormal.
+at::Tensor moe_combine_normal_impl_autograd(
+    const at::Tensor &recvX,
+    const at::Tensor &tokenSrcInfo,
+    const at::Tensor &epRecvCounts,
+    const at::Tensor &recvTopkWeights,
+    const c10::optional<at::Tensor> &tpRecvCounts,
+    c10::string_view epGroupName,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view tpGroupName,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t globalBs)
+{
+    return ExtMoeCombineNormal::apply(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts,
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+}
+
+// moe_combine_normal: NPU (PrivateUse1) kernels
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, PrivateUse1, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_npu);
+    // NOTE(review): no "moe_combine_normal_backward" schema is visible in this patch - confirm it exists.
+    m.impl("moe_combine_normal_backward", &moe_combine_normal_backward_impl_npu);
+}
+
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, AutogradPrivateUse1, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_autograd);
+}
+
+// Register the Meta-device implementation
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, Meta, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_meta);
+}
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
b/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
new file mode 100644
index 00000000..31a2987f
--- /dev/null
+++ b/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
@@ -0,0 +1,241 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * Description: add moe_dispatch_normal pybind extension file
+ * Create: 2025-12-10
+ * Note:
+ * History: 2025-12-10 create moe_dispatch_normal pybind extension file
+ */
+
+// NOTE(review): the angle-bracket header names were lost when this patch was extracted; the
+// system headers below are reconstructed from what this file uses. Confirm against the commit.
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+#include <torch/extension.h>
+#include <torch/library.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "pytorch_npu_helper.hpp"
+#include <torch/csrc/autograd/custom_function.h>
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+using namespace std;
+
+constexpr int KERNEL_PARAM_CNT = 3;
+
+namespace {
+// ExtMoeDispatchNormal::forward takes this many arguments after the AutogradContext; autograd
+// expects backward to return one gradient slot for each of them.
+constexpr size_t DISPATCH_FORWARD_INPUT_CNT = 15;
+
+// Copies a group name into an owned, NUL-terminated buffer so a plain char * can be passed on.
+std::vector<char> ToCStringBuffer(c10::string_view name)
+{
+    std::vector<char> buffer(name.begin(), name.end());
+    buffer.push_back('\0');
+    return buffer;
+}
+
+// Reads element moeExpertNum - 1 of recvCount (copied to the CPU), which this op uses as the
+// total number of received tokens, and returns it with 0 replaced by 1 so that the outputs
+// always have at least one row. Rejects a Meta tensor or an out-of-range index explicitly.
+int64_t ResolveRecvTokenCount(const at::Tensor &recvCount, int64_t moeExpertNum)
+{
+    TORCH_CHECK(!recvCount.is_meta(),
+        "moe_dispatch_normal: output shape is read from recvCount's data, which a Meta tensor does not have");
+    TORCH_CHECK(moeExpertNum >= 1 && moeExpertNum <= recvCount.numel(),
+        "moe_dispatch_normal: moeExpertNum ", moeExpertNum, " is out of range for recvCount");
+    const at::Tensor recvCountCpu = recvCount.to(at::kCPU).reshape({-1});
+    const int64_t totalRecvTokens = recvCountCpu[moeExpertNum - 1].item<int64_t>();
+    return totalRecvTokens == 0 ? 1 : totalRecvTokens;
+}
+
+// Allocates the three zero-filled outputs on x's device: [totalCnt, x.size(1)] with x's options,
+// [totalCnt] float and [totalCnt * KERNEL_PARAM_CNT] int.
+std::tuple<at::Tensor, at::Tensor, at::Tensor> AllocateDispatchOutputs(const at::Tensor &x, int64_t totalCnt)
+{
+    auto expandxOut = at::zeros({totalCnt, x.size(1)}, x.options());
+    auto dynamicScalesOut = at::zeros({totalCnt}, at::dtype(at::kFloat).device(x.device()));
+    auto expandIdxOut = at::zeros({totalCnt * KERNEL_PARAM_CNT}, at::dtype(at::kInt).device(x.device()));
+    return std::make_tuple(expandxOut, dynamicScalesOut, expandIdxOut);
+}
+} // namespace
+
+// NPU (PrivateUse1) kernel for moe_dispatch_normal: allocates the outputs, launches
+// aclnnMoeDispatchNormal to fill them and returns (expandxOut, dynamicScalesOut, expandIdxOut).
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_npu(
+    const at::Tensor &x,
+    const at::Tensor &topkIdx,
+    const at::Tensor &sendOffset,
+    const at::Tensor &sendTokenIdx,
+    const at::Tensor &recvOffset,
+    const at::Tensor &recvCount,
+    c10::string_view groupEp,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view groupTp,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t quantMode,
+    int64_t globalBs)
+{
+    // Must stay aligned with fused_deep_moe.cpp: inputs first, then attrs, then outputs.
+    std::vector<char> groupEpChrs = ToCStringBuffer(groupEp);
+    std::vector<char> groupTpChrs = ToCStringBuffer(groupTp);
+    char *groupEpPtr = groupEpChrs.data();
+    char *groupTpPtr = groupTpChrs.data();
+
+    auto outputs = AllocateDispatchOutputs(x, ResolveRecvTokenCount(recvCount, moeExpertNum));
+    at::Tensor &expandxOut = std::get<0>(outputs);
+    at::Tensor &dynamicScalesOut = std::get<1>(outputs);
+    at::Tensor &expandIdxOut = std::get<2>(outputs);
+
+    EXEC_NPU_CMD(aclnnMoeDispatchNormal,
+        // input
+        x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount,
+        // attr
+        groupEpPtr, epWorldSize, epRankId, groupTpPtr, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs,
+        // output
+        expandxOut, dynamicScalesOut, expandIdxOut);
+    return outputs;
+}
+
+// Kernel registered under "moe_dispatch_normal_backward"; returns three handles to a copy of
+// the tensor handle it was given.
+tensor_list moe_dispatch_normal_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self);
+    return {result, result, result};
+}
+
+// Meta-device kernel: allocates outputs shaped like the NPU kernel's without launching anything.
+// The row count comes from recvCount's data, so a Meta recvCount is rejected by
+// ResolveRecvTokenCount with an explicit error.
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_meta(
+    const at::Tensor &x,
+    const at::Tensor &topkIdx,
+    const at::Tensor &sendOffset,
+    const at::Tensor &sendTokenIdx,
+    const at::Tensor &recvOffset,
+    const at::Tensor &recvCount,
+    c10::string_view groupEp,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view groupTp,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t quantMode,
+    int64_t globalBs)
+{
+    return AllocateDispatchOutputs(x, ResolveRecvTokenCount(recvCount, moeExpertNum));
+}
+
+// Looks up umdk_cam_op_lib::moe_dispatch_normal in the dispatcher and calls it, so the kernel
+// registered for the active dispatch key runs.
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl(
+    const at::Tensor &x,
+    const at::Tensor &topkIdx,
+    const at::Tensor &sendOffset,
+    const at::Tensor &sendTokenIdx,
+    const at::Tensor &recvOffset,
+    const at::Tensor &recvCount,
+    c10::string_view groupEp,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view groupTp,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t quantMode,
+    int64_t globalBs)
+{
+    static auto op = torch::Dispatcher::singleton()
+        .findSchemaOrThrow("umdk_cam_op_lib::moe_dispatch_normal", "")
+        .typed<decltype(moe_dispatch_normal_impl)>();
+    return op.call(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount, groupEp, epWorldSize, epRankId,
+        groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+}
+
+// Binds forward and backward by inheriting from torch::autograd::Function.
+class ExtMoeDispatchNormal : public torch::autograd::Function<ExtMoeDispatchNormal> {
+public:
+    static tensor_list forward(
+        AutogradContext *ctx,
+        const at::Tensor &x,
+        const at::Tensor &topkIdx,
+        const at::Tensor &sendOffset,
+        const at::Tensor &sendTokenIdx,
+        const at::Tensor &recvOffset,
+        const at::Tensor &recvCount,
+        c10::string_view groupEp,
+        int64_t epWorldSize,
+        int64_t epRankId,
+        c10::string_view groupTp,
+        int64_t tpWorldSize,
+        int64_t tpRankId,
+        int64_t moeExpertNum,
+        int64_t quantMode,
+        int64_t globalBs)
+    {
+        // Exclude the autograd keys so the dispatcher call below reaches the backend kernel.
+        at::AutoDispatchBelowADInplaceOrView guard;
+        auto result = moe_dispatch_normal_impl(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount,
+            groupEp, epWorldSize, epRankId, groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+        return {std::get<0>(result), std::get<1>(result), std::get<2>(result)};
+    }
+
+    // No gradient is produced: one undefined tensor is returned per forward argument.
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        return tensor_list(DISPATCH_FORWARD_INPUT_CNT);
+    }
+};
+
+// Entry point registered for AutogradPrivateUse1 and bound in pybind.cpp. Returns a 3-tuple so
+// the definition agrees with the declaration in functions.h and the registered schema.
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_autograd(
+    const at::Tensor &x,
+    const at::Tensor &topkIdx,
+    const at::Tensor &sendOffset,
+    const at::Tensor &sendTokenIdx,
+    const at::Tensor &recvOffset,
+    const at::Tensor &recvCount,
+    c10::string_view groupEp,
+    int64_t epWorldSize,
+    int64_t epRankId,
+    c10::string_view groupTp,
+    int64_t tpWorldSize,
+    int64_t tpRankId,
+    int64_t moeExpertNum,
+    int64_t quantMode,
+    int64_t globalBs)
+{
+    tensor_list outputs = ExtMoeDispatchNormal::apply(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount,
+        groupEp, epWorldSize, epRankId, groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+    return std::make_tuple(outputs[0], outputs[1], outputs[2]);
+}
+
+// moe_dispatch_normal: NPU (PrivateUse1) kernels
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, PrivateUse1, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_npu);
+    // NOTE(review): no "moe_dispatch_normal_backward" schema is visible in this patch - confirm it exists.
+    m.impl("moe_dispatch_normal_backward", &moe_dispatch_normal_backward_impl_npu);
+}
+
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, AutogradPrivateUse1, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_autograd);
+}
+
+// Register the Meta-device implementation
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, Meta, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_meta);
+}
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/pybind.cpp b/src/cam/comm_operator/pybind/pybind.cpp
index a8282f15..65a40671 100644
--- a/src/cam/comm_operator/pybind/pybind.cpp
+++ b/src/cam/comm_operator/pybind/pybind.cpp
@@ -13,6 +13,8 @@
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("fused_deep_moe", &fused_deep_moe_impl_autograd, "fused_deep_moe");
+    m.def("moe_dispatch_normal", &moe_dispatch_normal_impl_autograd, "moe_dispatch_normal");
+    m.def("moe_combine_normal", &moe_combine_normal_impl_autograd, "moe_combine_normal");
 }
 
 TORCH_LIBRARY(umdk_cam_op_lib, m) {
@@ -20,4 +22,10 @@ TORCH_LIBRARY(umdk_cam_op_lib, m) {
         Tensor gmm2Weight, Tensor gmm2WeightScale, Tensor expertSmoothScalesOptional, Tensor expertScalesOptional, \
         str groupEp, int epRankSize, int epRankId, int moeExpertNum, int shareExpertNum, int shareExpertRankNum, \
         int quantMode, int globalBs) -> Tensor");
+    m.def("moe_dispatch_normal(Tensor x, Tensor topkIdx, Tensor sendOffset, Tensor sendTokenIdx, Tensor recvOffset, \
+        Tensor recvCount, str groupEp, int epWorldSize, int epRankId, str groupTp, int tpWorldSize, int tpRankId, \
+        int moeExpertNum, int quantMode, int globalBs) -> (Tensor, Tensor, Tensor)");
+    m.def("moe_combine_normal(Tensor recvX, Tensor tokenSrcInfo, Tensor epRecvCounts, Tensor recvTopkWeights, \
+        Tensor? tpRecvCounts, str epGroupName, int epWorldSize, int epRankId, str tpGroupName, int tpWorldSize, \
+        int tpRankId, int moeExpertNum, int globalBs) -> Tensor");
 }
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/setup.py b/src/cam/comm_operator/pybind/setup.py
index 3e6e0033..dd46a73c 100644
--- a/src/cam/comm_operator/pybind/setup.py
+++ b/src/cam/comm_operator/pybind/setup.py
@@ -75,6 +75,8 @@ ext1 = NpuExtension(
         'gcov', 'runtime', 'torch', 'ascendcl', 'profapi', 'opapi', 'cust_opapi'],
 
     sources=["./fused_deep_moe.cpp",
+        "./moe_dispatch_normal.cpp",
+        "./moe_combine_normal.cpp",
         "./pybind.cpp",
         ],
 
-- 
Gitee