diff --git a/build/cam/build.sh b/build/cam/build.sh
old mode 100644
new mode 100755
diff --git a/build/cam/comm_operator/build.sh b/build/cam/comm_operator/build.sh
old mode 100644
new mode 100755
diff --git a/build/cam/comm_operator/build_pybind.sh b/build/cam/comm_operator/build_pybind.sh
index c0c9593d78b38b89b8f79445f0afb4ce1881c850..8ef99e3fb4a62f01ead9a5304705a90a68de0297 100644
--- a/build/cam/comm_operator/build_pybind.sh
+++ b/build/cam/comm_operator/build_pybind.sh
@@ -7,7 +7,7 @@
 # History: 2025-12-09 create pybind building script
 set -e
 
-EXT_PATH=$MODULE_BUILD_PATH/pybind/
+EXT_PATH=$MODULE_BUILD_PATH/pybind
 DIST_OUT_PATH=$MODULE_BUILD_OUT_PATH
 
 if [ ! -d "$MODULE_BUILD_PATH" ]; then
@@ -22,7 +22,7 @@ build_pybind() {
     cp -rf $MODULE_SRC_PATH/pybind/pytorch_extension $BUILD_PATH
     cd $EXT_PATH
     python3 setup.py bdist_wheel
-    DIST_GEN_PATH=$EXT_PATH/dist/
+    DIST_GEN_PATH=$EXT_PATH/dist
     if [ -d "$DIST_GEN_PATH" ]; then
         echo "copy $DIST_GEN_PATH to $DIST_OUT_PATH/"
         cp -rf $DIST_GEN_PATH $DIST_OUT_PATH
diff --git a/src/cam/comm_operator/pybind/functions.h b/src/cam/comm_operator/pybind/functions.h
index 305a8f49b5a612ca6036f6be835311cf9c515fcc..8f583349f9d1b851242ff3047aec8d22e22d69ae 100644
--- a/src/cam/comm_operator/pybind/functions.h
+++ b/src/cam/comm_operator/pybind/functions.h
@@ -33,4 +33,38 @@ at::Tensor fused_deep_moe_impl_autograd(
     int64_t shareExpertRankNum, \
     int64_t quantMode, \
     int64_t globalBs);
+
+std::vector<at::Tensor>
+moe_dispatch_normal_impl_autograd(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs);
+
+at::Tensor
+moe_combine_normal_impl_autograd(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const std::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs);
 #endif // COMMON_OPS_CSRC_FUNCTIONS_H_
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/fused_deep_moe.cpp b/src/cam/comm_operator/pybind/fused_deep_moe.cpp
index b1103453b8036f987279e390da3b7f892145345d..8ea3acb61ecf4b1601bad15fe4b6f08e08ed88bc 100644
--- a/src/cam/comm_operator/pybind/fused_deep_moe.cpp
+++ b/src/cam/comm_operator/pybind/fused_deep_moe.cpp
@@ -65,7 +65,7 @@ at::Tensor fused_deep_moe_impl_npu(
     return output;
 }
 
-std::tuple<at::Tensor, at::Tensor> fused_deep_moe_backward_impl_npu(const at::Tensor &self)
+tensor_list fused_deep_moe_backward_impl_npu(const at::Tensor &self)
 {
     at::Tensor result = at::Tensor(self); // allocate output memory
     return {result, result};
diff --git a/src/cam/comm_operator/pybind/moe_combine_normal.cpp b/src/cam/comm_operator/pybind/moe_combine_normal.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..570e9e0c4d6c161deaf0efe13cd41e2125c69671
--- /dev/null
+++ b/src/cam/comm_operator/pybind/moe_combine_normal.cpp
@@ -0,0 +1,184 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * Description: add moe_combine_normal pybind extension file
+ * Create: 2025-12-10
+ * Note:
+ * History: 2025-12-10 create moe_combine_normal pybind extension file
+ */
+
+#include <torch/extension.h>
+#include <torch/library.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "pytorch_npu_helper.hpp"
+#include <vector>
+#include <tuple>
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+using namespace std;
+
+constexpr int KERNEL_PARAM_CNT = 3;
+
+at::Tensor moe_combine_normal_impl_npu(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    std::vector<at::Tensor> weightBlocks;
+    if (recvTopkWeights.size(0) != 0) {
+        weightBlocks.emplace_back(recvTopkWeights);
+    }
+    at::Tensor expertScales = torch::cat(weightBlocks, 0);
+
+    // Combine data
+    auto combinedX = torch::empty({expertScales.size(0), recvX.size(1)}, recvX.options());
+
+    EXEC_NPU_CMD(aclnnMoeCombineNormal,
+        // input
+        recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+        // attr
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs, \
+        // output
+        combinedX);
+    return combinedX;
+}
+
+tensor_list moe_combine_normal_backward_impl_npu(const at::Tensor &self)
+{
+    return {at::Tensor(), at::Tensor(), at::Tensor()};
+}
+
+at::Tensor moe_combine_normal_impl_meta(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    std::vector<at::Tensor> weightBlocks;
+    if (recvTopkWeights.size(0) != 0) {
+        weightBlocks.emplace_back(recvTopkWeights);
+    }
+    at::Tensor expertScales = torch::cat(weightBlocks, 0);
+
+    // Combine data
+    auto combinedX = torch::empty({expertScales.size(0), recvX.size(1)}, recvX.options());
+
+    return combinedX;
+}
+
+at::Tensor moe_combine_normal_impl(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    static auto op = torch::Dispatcher::singleton()
+        .findSchemaOrThrow("umdk_cam_op_lib::moe_combine_normal", "")
+        .typed<decltype(moe_combine_normal_impl_npu)>();
+    return op.call(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+}
+
+// bind forward and backward by inheriting from torch::autograd::Function
+class ExtMoeCombineNormal : public torch::autograd::Function<ExtMoeCombineNormal> {
+public:
+    static at::Tensor forward(
+        AutogradContext *ctx, \
+        const at::Tensor &recvX, \
+        const at::Tensor &tokenSrcInfo, \
+        const at::Tensor &epRecvCounts, \
+        const at::Tensor &recvTopkWeights, \
+        const c10::optional<at::Tensor> &tpRecvCounts, \
+        c10::string_view epGroupName, \
+        int64_t epWorldSize, \
+        int64_t epRankId, \
+        c10::string_view tpGroupName, \
+        int64_t tpWorldSize, \
+        int64_t tpRankId, \
+        int64_t moeExpertNum, \
+        int64_t globalBs)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        auto result = moe_combine_normal_impl(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+            epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+        return result;
+    }
+
+    static tensor_list backward(
+        AutogradContext *ctx, \
+        tensor_list grad_outputs)
+    {
+        return {at::Tensor(), at::Tensor(), at::Tensor()};
+    }
+};
+
+at::Tensor moe_combine_normal_impl_autograd(
+    const at::Tensor &recvX, \
+    const at::Tensor &tokenSrcInfo, \
+    const at::Tensor &epRecvCounts, \
+    const at::Tensor &recvTopkWeights, \
+    const c10::optional<at::Tensor> &tpRecvCounts, \
+    c10::string_view epGroupName, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view tpGroupName, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t globalBs)
+{
+    auto result = ExtMoeCombineNormal::apply(recvX, tokenSrcInfo, epRecvCounts, recvTopkWeights, tpRecvCounts, \
+        epGroupName, epWorldSize, epRankId, tpGroupName, tpWorldSize, tpRankId, moeExpertNum, globalBs);
+    return result;
+}
+
+// moe_combine_normal
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, PrivateUse1, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_npu);
+    m.impl("moe_combine_normal_backward", &moe_combine_normal_backward_impl_npu);
+}
+
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, AutogradPrivateUse1, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_autograd);
+}
+
+// register forward/backward implementations for the Meta device
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, Meta, m)
+{
+    m.impl("moe_combine_normal", &moe_combine_normal_impl_meta);
+}
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp b/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..31a2987f28fa388d4926e940b2e825e43dea7d2c
--- /dev/null
+++ b/src/cam/comm_operator/pybind/moe_dispatch_normal.cpp
@@ -0,0 +1,215 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * Description: add moe_dispatch_normal pybind extension file
+ * Create: 2025-12-10
+ * Note:
+ * History: 2025-12-10 create moe_dispatch_normal pybind extension file
+ */
+
+#include <torch/extension.h>
+#include <torch/library.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "pytorch_npu_helper.hpp"
+#include <vector>
+#include <tuple>
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+using namespace std;
+
+constexpr int KERNEL_PARAM_CNT = 3;
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_npu(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs
+)
+{
+    // argument order must match fused_deep_moe.cpp: inputs first, then attrs, then outputs
+    vector<char> groupEpChrs(groupEp.begin(), groupEp.end());
+    groupEpChrs.push_back('\0');
+    char *groupEpPtr = &groupEpChrs[0];
+    vector<char> groupTpChrs(groupTp.begin(), groupTp.end());
+    groupTpChrs.push_back('\0');
+    char *groupTpPtr = &groupTpChrs[0];
+
+    auto recvCountCpu = recvCount.to(at::kCPU);
+    auto recvCountPtr = recvCountCpu.data_ptr<int64_t>();
+    auto hidden = static_cast<int64_t>(x.size(1));
+    int64_t totalRecvTokens = recvCountPtr[moeExpertNum - 1];
+    int totalCnt = totalRecvTokens == 0 ? 1 : totalRecvTokens;
+    auto expandxOut = at::zeros({totalCnt, hidden}, x.options());
+    auto dynamicScalesOut = at::zeros({totalCnt}, at::dtype(at::kFloat).device(x.device()));
+    auto expandIdxOut = at::zeros({totalCnt * KERNEL_PARAM_CNT}, at::dtype(at::kInt).device(x.device()));
+
+    EXEC_NPU_CMD(aclnnMoeDispatchNormal,
+        // input
+        x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount, \
+        // attr
+        groupEpPtr, epWorldSize, epRankId, groupTpPtr, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs, \
+        // output
+        expandxOut, dynamicScalesOut, expandIdxOut);
+    return std::make_tuple(expandxOut, dynamicScalesOut, expandIdxOut);
+}
+
+tensor_list moe_dispatch_normal_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self); // allocate output memory
+    return {result, result, result};
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl_meta(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs)
+{
+    // argument order must match fused_deep_moe.cpp: inputs first, then attrs, then outputs
+    vector<char> groupEpChrs(groupEp.begin(), groupEp.end());
+    groupEpChrs.push_back('\0');
+    char *groupEpPtr = &groupEpChrs[0];
+    vector<char> groupTpChrs(groupTp.begin(), groupTp.end());
+    groupTpChrs.push_back('\0');
+    char *groupTpPtr = &groupTpChrs[0];
+
+    auto recvCountCpu = recvCount.to(at::kCPU);
+    auto recvCountPtr = recvCountCpu.data_ptr<int64_t>();
+    auto hidden = static_cast<int64_t>(x.size(1));
+    int64_t totalRecvTokens = recvCountPtr[moeExpertNum - 1];
+    int totalCnt = totalRecvTokens == 0 ? 1 : totalRecvTokens;
+    auto expandxOut = at::zeros({totalCnt, hidden}, x.options());
+    auto dynamicScalesOut = at::zeros({totalCnt}, at::dtype(at::kFloat).device(x.device()));
+    auto expandIdxOut = at::zeros({totalCnt * KERNEL_PARAM_CNT}, at::dtype(at::kInt).device(x.device()));
+
+    return std::make_tuple(expandxOut, dynamicScalesOut, expandIdxOut);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> moe_dispatch_normal_impl(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs)
+{
+    static auto op = torch::Dispatcher::singleton()
+        .findSchemaOrThrow("umdk_cam_op_lib::moe_dispatch_normal", "")
+        .typed<decltype(moe_dispatch_normal_impl_npu)>();
+    return op.call(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, recvCount, groupEp, epWorldSize, epRankId, \
+        groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+}
+
+// bind forward and backward by inheriting from torch::autograd::Function
+class ExtMoeDispatchNormal : public torch::autograd::Function<ExtMoeDispatchNormal> {
+public:
+    static tensor_list forward(
+        AutogradContext *ctx, \
+        const at::Tensor &x, \
+        const at::Tensor &topkIdx, \
+        const at::Tensor &sendOffset, \
+        const at::Tensor &sendTokenIdx, \
+        const at::Tensor &recvOffset, \
+        const at::Tensor &recvCount, \
+        c10::string_view groupEp, \
+        int64_t epWorldSize, \
+        int64_t epRankId, \
+        c10::string_view groupTp, \
+        int64_t tpWorldSize, \
+        int64_t tpRankId, \
+        int64_t moeExpertNum, \
+        int64_t quantMode, \
+        int64_t globalBs)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        auto result = moe_dispatch_normal_impl(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, \
+            recvCount, groupEp, epWorldSize, epRankId, \
+            groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+
+        return {std::get<0>(result), std::get<1>(result), std::get<2>(result)};
+    }
+
+    static tensor_list backward(
+        AutogradContext *ctx, \
+        tensor_list grad_outputs)
+    {
+        return {at::Tensor(), at::Tensor(), at::Tensor()};
+    }
+};
+
+tensor_list moe_dispatch_normal_impl_autograd(
+    const at::Tensor &x, \
+    const at::Tensor &topkIdx, \
+    const at::Tensor &sendOffset, \
+    const at::Tensor &sendTokenIdx, \
+    const at::Tensor &recvOffset, \
+    const at::Tensor &recvCount, \
+    c10::string_view groupEp, \
+    int64_t epWorldSize, \
+    int64_t epRankId, \
+    c10::string_view groupTp, \
+    int64_t tpWorldSize, \
+    int64_t tpRankId, \
+    int64_t moeExpertNum, \
+    int64_t quantMode, \
+    int64_t globalBs)
+{
+    auto result = ExtMoeDispatchNormal::apply(x, topkIdx, sendOffset, sendTokenIdx, recvOffset, \
+        recvCount, groupEp, epWorldSize, epRankId, \
+        groupTp, tpWorldSize, tpRankId, moeExpertNum, quantMode, globalBs);
+    return result;
+}
+
+// moe_dispatch_normal
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, PrivateUse1, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_npu);
+    m.impl("moe_dispatch_normal_backward", &moe_dispatch_normal_backward_impl_npu);
+}
+
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, AutogradPrivateUse1, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_autograd);
+}
+
+// register forward/backward implementations for the Meta device
+TORCH_LIBRARY_IMPL(umdk_cam_op_lib, Meta, m)
+{
+    m.impl("moe_dispatch_normal", &moe_dispatch_normal_impl_meta);
+}
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/pybind.cpp b/src/cam/comm_operator/pybind/pybind.cpp
index a8282f158d414cece867f93f0c3a523e4a6fe259..65a40671265720b01ff1cc1902a9354b86c063d5 100644
--- a/src/cam/comm_operator/pybind/pybind.cpp
+++ b/src/cam/comm_operator/pybind/pybind.cpp
@@ -13,6 +13,8 @@
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
     m.def("fused_deep_moe", &fused_deep_moe_impl_autograd, "fused_deep_moe");
+    m.def("moe_dispatch_normal", &moe_dispatch_normal_impl_autograd, "moe_dispatch_normal");
+    m.def("moe_combine_normal", &moe_combine_normal_impl_autograd, "moe_combine_normal");
 }
 
 TORCH_LIBRARY(umdk_cam_op_lib, m) {
@@ -20,4 +22,10 @@ TORCH_LIBRARY(umdk_cam_op_lib, m) {
     m.def("fused_deep_moe(Tensor x, Tensor expertIds, Tensor gmm1PermutedWeight, Tensor gmm1PermutedWeightScale, \
         Tensor gmm2Weight, Tensor gmm2WeightScale, Tensor expertSmoothScalesOptional, Tensor expertScalesOptional, \
         str groupEp, int epRankSize, int epRankId, int moeExpertNum, int shareExpertNum, int shareExpertRankNum, \
         int quantMode, int globalBs) -> Tensor");
+    m.def("moe_dispatch_normal(Tensor x, Tensor topkIdx, Tensor sendOffset, Tensor sendTokenIdx, Tensor recvOffset, \
+        Tensor recvCount, str groupEp, int epWorldSize, int epRankId, str groupTp, int tpWorldSize, int tpRankId, \
+        int moeExpertNum, int quantMode, int globalBs) -> (Tensor, Tensor, Tensor)");
+    m.def("moe_combine_normal(Tensor recvX, Tensor tokenSrcInfo, Tensor epRecvCounts, Tensor recvTopkWeights, \
+        Tensor? tpRecvCounts, str epGroupName, int epWorldSize, int epRankId, str tpGroupName, int tpWorldSize, \
+        int tpRankId, int moeExpertNum, int globalBs) -> Tensor");
 }
\ No newline at end of file
diff --git a/src/cam/comm_operator/pybind/setup.py b/src/cam/comm_operator/pybind/setup.py
index 3e6e00339f3badf17dc553819ee352d56651b7c0..dd46a73c96da0cf6129982e1d96500f4925b80d2 100644
--- a/src/cam/comm_operator/pybind/setup.py
+++ b/src/cam/comm_operator/pybind/setup.py
@@ -75,6 +75,8 @@ ext1 = NpuExtension(
               'gcov', 'runtime', 'torch', 'ascendcl', 'profapi', 'opapi', 'cust_opapi'],
     sources=["./fused_deep_moe.cpp",
+             "./moe_dispatch_normal.cpp",
+             "./moe_combine_normal.cpp",
              "./pybind.cpp",
              ],
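
For reference, the sketch below shows how the two ops added by this change could be exercised from Python once the wheel produced by build/cam/comm_operator/build_pybind.sh is installed. Only the torch.ops namespace (umdk_cam_op_lib), the op names, and the argument order come from the schemas registered in pybind.cpp; the extension import name (umdk_cam_op_ext), the HCCL group names, and every shape, dtype, and attribute value are illustrative assumptions, not values taken from this change.

    # usage_sketch.py -- illustrative only; assumes an initialized NPU device and HCCL groups
    import torch
    import torch_npu            # brings up the NPU (PrivateUse1) backend
    import umdk_cam_op_ext      # assumed module name; importing it runs the TORCH_LIBRARY registration

    # The schemas registered in pybind.cpp become reachable through torch.ops once the module is loaded.
    dispatch_op = torch.ops.umdk_cam_op_lib.moe_dispatch_normal
    combine_op = torch.ops.umdk_cam_op_lib.moe_combine_normal
    print(dispatch_op, combine_op)

    # Argument order follows the registered schemas; all values below are placeholders and the
    # calls are left commented out because they require live EP/TP communication groups.
    # expand_x, dynamic_scales, expand_idx = dispatch_op(
    #     x, topk_idx, send_offset, send_token_idx, recv_offset, recv_count,
    #     "ep_group_name", 16, 0, "tp_group_name", 1, 0, 64, 0, 8)
    # combined_x = combine_op(
    #     recv_x, token_src_info, ep_recv_counts, recv_topk_weights, tp_recv_counts,
    #     "ep_group_name", 16, 0, "tp_group_name", 1, 0, 64, 8)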