diff --git a/LICENSE b/LICENSE index 72a0affe0e736781fc243ff6ec56571efd279641..b41cefbb51aa768c9ea00d3c92a6261fbaf04456 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ -CANN Open Software License Agreement Version 1.0 +CANN Open Software License Agreement Version 2.0 -This CANN Open Software License Agreement Version 1.0 (hereinafter referred to as this "Agreement") is a legal agreement between you and Huawei, and it governs your use, modification, or distribution of CANN Open Software (hereinafter referred to as "Software"). Please read this Agreement carefully. +This CANN Open Software License Agreement Version 2.0 (hereinafter referred to as this "Agreement") is a legal agreement between you and Huawei, and it governs your use, modification, or distribution of CANN Open Software (hereinafter referred to as "Software"). Please read this Agreement carefully. If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind that entity to this Agreement, in which case "you" will mean the entity you represent. @@ -10,28 +10,27 @@ BY DOWNLOADING, INSTALLING, OR USING THE SOFTWARE, YOU AGREE YOU HAVE FULLY UNDE 1.1 Software means the APIs, source code files, binaries, and related documents of Compute Architecture for Neural Networks("CANN") that are licensable by Huawei, and provided and licensed under this Agreement. -1.2 Ascend processors means the chipsets branded with "Ascend" that are manufactured and supplied by Huawei. +1.2 Huawei AI Processors mean AI chipsets (i) branded with "Ascend", "Kirin", "Yueying" or other brands owned or controlled by Huawei; or (ii) manufactured (including have manufactured), supplied (including have supplied) or designed (including have designed) by Huawei. 2. Grant of Intellectual Property Rights -Subject to the terms and conditions of this Agreement, including your full compliance thereof, Huawei hereby grants you a limited, worldwide, royalty-free, non-transferable, non-sublicensable, and revocable license for you to (i) download, use, modify, integrate, and distribute the Software or its derivative works for the purpose of developing software solely for use with Ascend processors, and (ii) distribute the software developed under (i) solely for use with Ascend processors. +2.1 Subject to the terms and conditions of this Agreement, including your full compliance thereof, Huawei hereby grants you a limited, worldwide, royalty-free, non-transferable, non-sublicensable, and revocable license for you to (i) download, use, modify, integrate, and distribute the Software or its derivative works for the purpose of developing software solely for use in systems with Huawei AI Processors and/or Software, and (ii) distribute any software developed based upon Software and/or its derivative works solely for use in systems with Huawei AI Processors and/or Software. 3. Restrictions -3.1 You are not authorized to, and shall not use, modify, or distribute this Software or its derivative works for any purpose except those expressly permitted by this Agreement. You shall not make any use of the Software or its derivative works to develop or distribute software for use in systems with processors other than Ascend processors. +3.1 You are not authorized to, and shall not use, modify, or distribute this Software or its derivative works for any other purposes than those expressly permitted by this Agreement. 
You shall not make any use of the Software or its derivative works to develop or distribute any software for use in systems with processors other than Huawei AI processors. All rights not expressly granted herein are expressly reserved by Huawei. 3.2 You are not authorized to, and shall not remove, obscure, or alter any copyright or other notices in this Software or any part of it. -3.3 Distribution Restrictions. -You may distribute the Software or its derivative works in any medium, whether in source or executable forms, provided that you comply with the purpose restriction stipulated in Section 2, provide recipients with a copy of this Agreement, and retain all notices in the Software. +3.3 Distribution Restrictions +You may distribute the Software or its derivative works in any medium, whether in source or executable forms, for the purpose stipulated in Section 2; provided that you provide recipients with a copy of this Agreement, and retain all notices in the Software. 4. Disclaimer of Warranty and Limitation of Liability -THE SOFTWARE IS PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL HUAWEI OR ANY OTHER COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE, IN WHOLE OR IN PART, NO MATTER HOW IT’S CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +THE SOFTWARE IS PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL HUAWEI OR ANY OTHER COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE, IN WHOLE OR IN PART, NO MATTER HOW IT IS CAUSED OR THE LEGAL THEORY IT IS BASED ON, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 5. Termination 5.1 This Agreement will continue to apply until terminated by either you or Huawei as described below: a.You may terminate this Agreement by ceasing your use of the Software; b. Huawei may at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement; or (ii) you directly or indirectly initiate any legal proceeding against any individual or entity by alleging that the Software or any part of it infringes your intellectual property rights. - -5.2 By termination, all the rights granted to you under this Agreement are terminated, and you shall cease to use and delete this Software or its derivative works immediately. +5.2 By termination, all the rights granted to you under this Agreement are terminated, and you shall cease to use and delete this Software or any derivative works immediately. All rights granted to you under this Agreement shall hereby be void ab initio in the event of termination in accordance with Section 5.1. b above. Huawei reserves the right to pursue any and all legal remedies available to enforce the terms and conditions of this Agreement or to protect Huawei’s intellectual property rights for such breach or violation. All provisions shall survive the termination of this Agreement except for Section 2 and Section 3.3. 6. 
MISCELLANEOUS If the application of any provision of this Agreement to any particular facts or circumstances is held to be invalid or unenforceable by a court of competent jurisdiction, then (a) the validity and enforceability of such provision as applied to any other particular facts or circumstances and the validity of other provisions of this Agreement shall not in any way be affected or impaired thereby and (b) such provision shall be enforced to the maximum extent possible so as to effect the intent of you and Huawei and reformed without further action by you and Huawei to the extent necessary to make such provision valid and enforceable.
diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..358fa460c8e2133169e7da139d0e6b0c45e67e77
--- /dev/null
+++ b/atvc/examples/reduce_max/README.md
@@ -0,0 +1,46 @@
+
+## 概述
+
+本样例介绍了利用ATVC实现ReduceMax单算子并完成功能验证。
+
+## 样例支持产品型号:
+- Atlas A2训练系列产品
+
+## 目录结构
+
+| 文件名 | 描述 |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [reduce_max.cpp](./reduce_max.cpp) | ReduceMax算子代码实现以及调用样例 |
+
+## 算子描述
+
+ReduceMax是对输入tensor的指定轴进行规约求最大值的计算并输出结果的Reduce类算子。
+
+ReduceMax算子规格:
+
+| 算子类型(OpType) | ReduceMax | | | |
+| --- | --- | --- | --- | --- |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float | ND |
+| 算子输出 | y | 1 * 2048 | float | ND |
+| 核函数名 | ReduceCustom | | | |
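+
+作为参考,下面给出一个主机侧的标量参考实现草图(非 ATVC 接口,函数名 `ReduceMaxAxis0` 为示意命名),假设输入按行主序排列,用于说明按第 0 轴规约求最大值时 golden 数据的语义:
+
+```cpp
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+// 对 shape 为 {dimR, dimA} 的行主序数据按第 0 轴(R 轴)求最大值,
+// 输出长度为 dimA;以 -inf 作为 max 的单位元
+std::vector<float> ReduceMaxAxis0(const std::vector<float> &x, int dimR, int dimA)
+{
+    std::vector<float> y(dimA, -std::numeric_limits<float>::infinity());
+    for (int r = 0; r < dimR; ++r) {
+        for (int a = 0; a < dimA; ++a) {
+            y[a] = std::max(y[a], x[static_cast<size_t>(r) * dimA + a]);
+        }
+    }
+    return y;
+}
+```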
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_max +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c5ce2bce6c0762f8c42fe0c62846318657fb9366 --- /dev/null +++ b/atvc/examples/reduce_max/reduce_max.cpp @@ -0,0 +1,177 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom( + GM_ADDR x, + GM_ADDR y, + ATVC::ReduceParam reduceParam +) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} + +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + (void)printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + (void)printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + (void)printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + (void)printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + (void)printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md new file mode 100644 index 0000000000000000000000000000000000000000..af85b2bc89edbb7ff9faa90e55bc8a6ce57cf91f --- /dev/null +++ b/atvc/examples/reduce_min/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMin单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 
| 描述 |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [reduce_min.cpp](./reduce_min.cpp) | ReduceMin算子代码实现以及调用样例 |
+
+## 算子描述
+
+ReduceMin是对输入tensor的指定轴进行规约求最小值的计算并输出结果的Reduce类算子。
+
+ReduceMin算子规格:
+
+| 算子类型(OpType) | ReduceMin | | | |
+| --- | --- | --- | --- | --- |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float | ND |
+| 算子输出 | y | 1 * 2048 | float | ND |
+| 核函数名 | ReduceCustom | | | |
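+
+该样例调用的 Reduce 模板(reduce_min.h 中的 ReduceAR/ReduceRA)在 UB 内采用折半归约:先取不大于 R 轴长度的最大 2 的幂 mainR,把尾部折叠进主区,再逐层减半。下面是该思路的标量示意(非内核代码,函数名 `TreeMin` 为示意命名,假设输入非空):
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+// 折半归约示意:mainR 为不大于 v.size() 的最大 2 的幂;
+// 先把尾部 [mainR, size) 折叠进主区,再逐层减半,v[0] 即最小值
+float TreeMin(std::vector<float> v)
+{
+    size_t mainR = 1;
+    while (mainR * 2 <= v.size()) {
+        mainR *= 2;
+    }
+    for (size_t i = mainR; i < v.size(); ++i) { // 尾部折叠
+        v[i - mainR] = std::min(v[i - mainR], v[i]);
+    }
+    for (size_t len = mainR / 2; len >= 1; len /= 2) { // 逐层减半
+        for (size_t i = 0; i < len; ++i) {
+            v[i] = std::min(v[i], v[i + len]);
+        }
+    }
+    return v[0];
+}
+```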
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_min +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp new file mode 100644 index 0000000000000000000000000000000000000000..525f57e96d95039b5d7f3b92db99c0583c350ace --- /dev/null +++ b/atvc/examples/reduce_min/reduce_min.cpp @@ -0,0 +1,177 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom( + GM_ADDR x, + GM_ADDR y, + ATVC::ReduceParam reduceParam +) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} + +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + (void)printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + (void)printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + (void)printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + (void)printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + (void)printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h index 066054c24377ac5d4c2ace93c3507602dbd69e7c..e9b09c902fbf6e013b7123bafe59c21d185f7fa3 100644 --- a/atvc/include/atvc.h +++ b/atvc/include/atvc.h @@ -28,6 +28,8 @@ #include "common/kernel_utils.h" #include "elewise/elewise_op_template.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include 
"reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #include "broadcast/broadcast_compute.h" #include "broadcast/broadcast_op_template.h" diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index d85353e8f249462d1e9089931c1f6301f6c44311..69d21aa4cd5b0b52e6c6eccae956d19b054fa613 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -28,6 +28,18 @@ enum ShapeDim { DIM_BROADCAST // Broadcast axis }; +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; + namespace AR_PATTERN { static constexpr uint32_t A = 100; static constexpr uint32_t AR = 11; diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h index ac454343399c9e9b730f0e4476852fd3112ea306..c30a69abaee415de788c1cbca3cb839dc0e50ed9 100644 --- a/atvc/include/reduce/reduce_device.h +++ b/atvc/include/reduce/reduce_device.h @@ -24,6 +24,8 @@ #include "common/kernel_utils.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h new file mode 100644 index 0000000000000000000000000000000000000000..abb540469f2e06a7ec46edc5c32d061ce7d3b438 --- /dev/null +++ b/atvc/include/reduce/reduce_max.h @@ -0,0 +1,349 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MAX_H +#define ATVC_REDUCE_MAX_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMaxCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMaxCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMaxCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMax(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MAX-reduction the neutral element is -∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (-∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMax accumulates R-axis data, the values of the supplemented elements + // are set to -∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MIN; + }else if(AscendC::IsSameType::value){ + return 0; + }else{ + return -1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MAX_H diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h new file mode 100644 index 0000000000000000000000000000000000000000..ad88e5fb9ee4e9a6e4c9d25050ac227f12a0b928 --- /dev/null +++ b/atvc/include/reduce/reduce_min.h @@ -0,0 +1,349 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MIN_H +#define ATVC_REDUCE_MIN_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMinCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMinCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMinCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMin(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MIN-reduction the neutral element is +∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (+∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMin accumulates R-axis data, the values of the supplemented elements + // are set to +∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MAX; + }else if(AscendC::IsSameType::value){ + return INT32_MAX - INT32_MIN; + }else{ + return 1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MIN_H diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 37a19be9dd94e9b1900e89cc1ec0bb4656f7666e..83bb2604ae2d77c5cceca59665b67932fd0878e0 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -148,7 +148,9 @@ public: template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { - bufPool_.AllocTensor(tensor); + T DupValue = needDup ? compute_.template GetPaddingValue() : 0; + bufPool_.AllocTensor(tensor, DupValue); + // bufPool_.AllocTensor(tensor); } /*! diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index b94ebad57be1d72b7b8a75ce93af0c218d9e9640..563ed5ecd201a1ef2316480afc73f91be8e4f77f 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -14,20 +14,7 @@ #include "common/kernel_utils.h" #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - -namespace { -struct ReduceARParam { - uint32_t repStride = 0; - uint16_t dimA = 0; - uint16_t dimMax = 0; - uint16_t mainR = 0; - uint16_t tailR = 0; - uint64_t maskAddRNum = 0; - uint16_t loopRNum = 0; - uint16_t dtypeSize = 0; - uint16_t dimR = 0; -}; -} +#include "reduce/common/reduce_common.h" namespace ATVC { /*! diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h index 2fa2359c22fa90e2d5a7b1f0df42f7e2bdb073d0..1c209adc2e56a643b2c5d0d41aa1c5051079040f 100644 --- a/atvc/include/reduce/utils/reduce_block_aux.h +++ b/atvc/include/reduce/utils/reduce_block_aux.h @@ -235,7 +235,7 @@ public: computeTensor = ubTensor; } else { // The index of AlloccomputeTensorAux does not require external perception - op_->ReduceOp::template AllocTensorAux(computeTensor); + op_->ReduceOp::template AllocTensorAux(computeTensor); CopyIn(view, shape, ubTensor); SetEvent(AscendC::HardEvent::MTE2_V); AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]); @@ -247,14 +247,21 @@ public: __aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... 
args) { SliceView view; + bool needDup = false; for (int64_t i = 0; i < bisectionTail; i++) { AscendC::LocalTensor tensorLeft; - op_->ReduceOp::template AllocTensorAux(tensorLeft); + op_->ReduceOp::template AllocTensorAux(tensorLeft); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft); AscendC::LocalTensor tensorRight; - op_->ReduceOp::template AllocTensorAux(tensorRight); + needDup = i == bisectionTail - 1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensorRight); + }else{ + op_->ReduceOp::template AllocTensorAux(tensorRight); + } + // op_->ReduceOp::template AllocTensorAux(tensorRight); AscendC::LocalTensor computeRight; PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight); ComputeMerge(shape, computeLeft, computeRight, args...); @@ -267,7 +274,13 @@ public: for (int64_t i = bisectionTail; i < bisectionPos; i++) { AscendC::LocalTensor tensor; - op_->ReduceOp::template AllocTensorAux(tensor); + needDup = i == bisectionPos -1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensor); + }else{ + op_->ReduceOp::template AllocTensorAux(tensor); + } + // op_->ReduceOp::template AllocTensorAux(tensor); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA && Pattern::Dim > 2), false>(i, view, shape, tensor, computeLeft); Compute(shape, computeLeft, args...); diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h index 26d2ef4ab8f991d1276db8da26090cd66bc860be..b980b1f73f6cd3e3961785e87b079bc340dccba9 100644 --- a/atvc/include/reduce/utils/reduce_buf_pool.h +++ b/atvc/include/reduce/utils/reduce_buf_pool.h @@ -62,7 +62,10 @@ public: // Init buffer GetTPipePtr()->InitBuffer(qQue_, poolSize); AscendC::LocalTensor inputUb = qQue_.GetWithOffset(basicNum_ * inputNum, 0); - AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + // AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + for(int16_t i =0;i - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) { + __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor, T DupValue) { if constexpr (temp){ if constexpr (IsInput) { int32_t idx = GetPreTensorId(); @@ -96,11 +99,16 @@ public: int32_t idx = GetInputTensorId(); tensor = qQue_.GetWithOffset(basicNum_, inputUnit_.offset + idx * basicNum_ * sizeof(T)); if constexpr (needDup) { - AscendC::Duplicate(tensor, 0, basicNum_); + AscendC::PipeBarrier(); + AscendC::Duplicate(tensor, DupValue, basicNum_); + } + if(usedTBuf_[idx] || needDup){ event_t allocEventId = static_cast(GetTPipePtr()->FetchEventID()); eventIdV2Mte2_[idx] = allocEventId; + needWaitFlag_[idx] = true; AscendC::SetFlag(allocEventId); } + usedTBuf_[idx] = true; } else { int32_t idx = GetComputeTensorId(); tensor = qQue_.GetWithOffset(basicNum_, computeUnit_.offset + idx * basicNum_ * sizeof(T)); @@ -119,7 +127,11 @@ public: uint64_t offset = (uint64_t)(tensor.GetPhyAddr()); if (offset - start < computeUnit_.offset) { int32_t idx = (offset - start) / sizeof(T) / basicNum_; - AscendC::WaitFlag(eventIdV2Mte2_[idx]); + // AscendC::WaitFlag(eventIdV2Mte2_[idx]); + if(needWaitFlag_[idx]){ + AscendC::WaitFlag(eventIdV2Mte2_[idx]); + needWaitFlag_[idx] = false; + } } } @@ -153,6 +165,8 @@ private: PoolManagerUnit computeUnit_; PoolManagerUnit postUnit_; event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + bool needWaitFlag_[MAX_INPUT_SIZE]; + bool usedTBuf_[MAX_INPUT_SIZE]; AscendC::TBuf<> qQue_; int32_t basicNum_; }; // class ReduceBufPool 
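The reduce_buf_pool.h hunk above replaces the unconditional V→MTE2 wait with tracked bookkeeping: a sync flag is set only when a buffer slot was handed out before (or needs a Duplicate re-initialization), and waited on only when one is actually pending. A minimal host-side model of that bookkeeping, with hypothetical names and no AscendC dependency, might look like:

```cpp
#include <array>
#include <cassert>

// Host-side model of the lazy flag tracking added to ReduceBufPool.
// Alloc() mirrors AllocTensor(): set a flag only if the slot was used
// before or must be re-initialized; Acquire() mirrors the WaitFlag path.
struct LazyFlagPool {
    static constexpr int kSlots = 8;     // stand-in for MAX_INPUT_SIZE
    std::array<bool, kSlots> used{};     // usedTBuf_
    std::array<bool, kSlots> pending{};  // needWaitFlag_

    void Alloc(int idx, bool needDup)
    {
        if (used[idx] || needDup) {
            pending[idx] = true;         // corresponds to SetFlag(eventIdV2Mte2_[idx])
        }
        used[idx] = true;
    }

    void Acquire(int idx)
    {
        if (pending[idx]) {              // WaitFlag fires only if a flag was set
            pending[idx] = false;
        }
    }
};

int main()
{
    LazyFlagPool pool;
    pool.Alloc(0, /*needDup=*/false);    // first use: no flag set
    pool.Acquire(0);                     // nothing pending, no wait
    pool.Alloc(0, /*needDup=*/false);    // reuse: flag set
    assert(pool.pending[0]);
    pool.Acquire(0);                     // the wait fires exactly once
    assert(!pool.pending[0]);
    return 0;
}
```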
diff --git a/cmake/scripts/gen_kernel_tiling_data_def.py b/cmake/scripts/gen_kernel_tiling_data_def.py
index 22394286241ddb8ffcf9efdb19691739372f9a16..c8e08618ba74b0f1e8738196aaeba3b29c94a5d3 100644
--- a/cmake/scripts/gen_kernel_tiling_data_def.py
+++ b/cmake/scripts/gen_kernel_tiling_data_def.py
@@ -15,13 +15,52 @@
 import os
 import re
 
+_NAMESPACE = "AscendC::tiling"
+_LEGACY_TILING_STRUCTS = [
+    "LogSoftMaxTiling",
+    "SoftMaxTiling",
+    "TConv3DApiTiling",
+    "TConv3DBpFilterTiling",
+    "Conv3DBpFilterParams",
+    "TConv3DBpFilterBasicBlockTiling",
+    "Conv3DBackpropFilterTilingData",
+    "TConv3DBackpropInputTiling",
+    "Conv3DBackpropInputTilingData",
+    "Mc2ServerCfg",
+    "Mc2HcommCfg",
+    "Mc2InitTiling",
+    "Mc2CcTiling",
+    "TCubeTiling",
+    "BatchNormTiling",
+    "DeepNormTiling",
+    "GroupNormTiling",
+    "LayerNormGradBetaTiling",
+    "LayerNormGradTiling",
+    "LayerNormTiling",
+    "LayerNormSeparateTiling",
+    "RmsNormTiling",
+    "UnPadTiling",
+    "PadTiling",
+    "TopkTiling",
+    "ConfusionTransposeTiling"
+]
+
+
 def gen_tiling(tiling_header_file):
     single_tiling_source = ""
+    single_legacy_tiling_export = ""
     if not os.path.exists(tiling_header_file):
         print("warning: no userdef tiling header file: ", tiling_header_file)
-        return single_tiling_source
+        return single_tiling_source, single_legacy_tiling_export
     print("generate tiling def header file: ", tiling_header_file)
     pattern = re.compile(r'[(](.*)[)]', re.S)
+
+    def parse_legacy_tiling(struct_def):
+        # re-export legacy tiling structs via global using-declarations for compatibility
+        nonlocal single_legacy_tiling_export
+        if struct_def in _LEGACY_TILING_STRUCTS:
+            single_legacy_tiling_export += f"using {_NAMESPACE}::{struct_def};\n"
+
     with open(tiling_header_file, 'r') as fd:
         lines = fd.readlines()
         for line in lines:
@@ -29,8 +68,9 @@ def gen_tiling(tiling_header_file):
             if (line.startswith('BEGIN_TILING_DATA_DEF')):
                 single_tiling_source += '#pragma pack(push, 8)\n'
                 single_tiling_source += 'struct '
-                struct_def = re.findall(pattern, line)[0]
+                struct_def = re.findall(pattern, line)[0]
                 single_tiling_source += struct_def + ' {\n'
+                parse_legacy_tiling(struct_def)
             elif (line.startswith('TILING_DATA_FIELD_DEF_ARR')):
                 field_params = re.findall(pattern, line)[0]
                 fds = field_params.split(',')
@@ -46,7 +86,7 @@ def gen_tiling(tiling_header_file):
             elif (line.startswith('END_TILING_DATA_DEF')):
                 single_tiling_source += '};\n'
                 single_tiling_source += '#pragma pack(pop)\n'
-    return single_tiling_source
+    return single_tiling_source, single_legacy_tiling_export
@@ -62,6 +102,7 @@
 #endif
 """
+    res += "namespace AscendC {\nnamespace tiling {\n"
 
     print("[LOG]: ", sys.argv[1], sys.argv[2], sys.argv[3])
     src_tiling_data_path = sys.argv[1]
     file_list = []
@@ -79,8 +120,17 @@
         if file.endswith("tilingdata.h") and file not in file_set:
             file_list.append(os.path.join(root, file))
     file_list.sort()
+
+    tiling_source = ""
+    legacy_tiling_export = ""
     for file in file_list:
-        res += gen_tiling(file)
+        src, exp = gen_tiling(file)
+        tiling_source += src
+        legacy_tiling_export += exp
+
+    res += tiling_source + "} // namespace tiling\n} // namespace AscendC\n\n"
+    res += legacy_tiling_export
+
     res += '#endif\n'
     generate_file = sys.argv[3]
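To make the effect of this generator change concrete, here is the rough shape of the header it would now emit. Only the namespace wrapper and the trailing using-declaration come from the script above; the struct body and field names are hypothetical:

#include <cstdint>

namespace AscendC {
namespace tiling {
#pragma pack(push, 8)
struct SoftMaxTiling {
    uint32_t srcM = 0;  // hypothetical field
    uint32_t srcK = 0;  // hypothetical field
};
#pragma pack(pop)
} // namespace tiling
} // namespace AscendC

// Emitted only for structs on the legacy list, so existing code that names
// SoftMaxTiling unqualified keeps compiling after the namespace move.
using AscendC::tiling::SoftMaxTiling;

Structs not on the legacy list get no alias and must be referenced as AscendC::tiling::StructName from now on.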
diff --git a/impl/activation/softmax/membase/common/simple_softmax_common_impl.h b/impl/activation/softmax/membase/common/simple_softmax_common_impl.h
index 3c7507a6c9fcb8cf2ec32e33254b13227e5c7260..2d9b00fceb135d01774d604039acff0d76770885 100644
--- a/impl/activation/softmax/membase/common/simple_softmax_common_impl.h
+++ b/impl/activation/softmax/membase/common/simple_softmax_common_impl.h
@@ -308,17 +308,11 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst,
         PipeBarrier<PIPE_V>();
         Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
     } else {
-        uint32_t splitK = 0;
-        if constexpr (config.oriSrcK % HALF_NUM_PER_BLK == 0) {
-            splitK = config.oriSrcK;
-        } else {
-            splitK = AlignUp(config.oriSrcK, HALF_NUM_PER_BLK);
-        }
         Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
         Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
         PipeBarrier<PIPE_V>();
-        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, splitK,
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.splitK,
             DEFAULT_REPEAT_STRIDE * HALF_FACTOR);
         PipeBarrier<PIPE_V>();
@@ -326,7 +320,7 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst,
         Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
         PipeBarrier<PIPE_V>();
-        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, splitK,
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.splitK,
             DEFAULT_REPEAT_STRIDE * HALF_FACTOR);
         PipeBarrier<PIPE_V>();
@@ -348,18 +342,12 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst,
         PipeBarrier<PIPE_V>();
         GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
     } else {
-        uint32_t splitK = 0;
-        if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) {
-            splitK = config.oriSrcK;
-        } else {
-            splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK);
-        }
-        GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, splitK,
+        GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.splitK,
             DEFAULT_REPEAT_STRIDE);
         PipeBarrier<PIPE_V>();
         Exp(dst[offset1], dst[offset1], splitSize);
         PipeBarrier<PIPE_V>();
-        GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, splitK,
+        GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.splitK,
             DEFAULT_REPEAT_STRIDE);
     }
 }
@@ -461,20 +449,14 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst,
         PipeBarrier<PIPE_V>();
         Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
     } else {
-        uint32_t splitK = 0;
-        if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) {
-            splitK = config.oriSrcK;
-        } else {
-            splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK);
-        }
         Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
         PipeBarrier<PIPE_V>();
-        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, splitK,
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.splitK,
             DEFAULT_REPEAT_STRIDE);
         PipeBarrier<PIPE_V>();
         Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize);
         PipeBarrier<PIPE_V>();
-        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, splitK,
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.splitK,
             DEFAULT_REPEAT_STRIDE);
         PipeBarrier<PIPE_V>();
         Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
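These hunks drop the per-call recomputation of splitK in favor of the precomputed tiling.splitK. For the change to be behavior-preserving, tiling.splitK must carry the same value the deleted branches produced: oriSrcK rounded up to a whole 32-byte block. A minimal sketch of that invariant, assuming (consistent with the deleted code but not stated in the patch) that HALF_NUM_PER_BLK == 16 and FLOAT_NUM_PER_BLK == 8, i.e. the element counts of one 32-byte block:

#include <cstdint>

// Same rounding the removed branches performed inline.
constexpr uint32_t AlignUp(uint32_t x, uint32_t align)
{
    return (x + align - 1) / align * align;
}

static_assert(AlignUp(100, 16) == 112, "half path: K rounded up to 16 elements");
static_assert(AlignUp(100, 8) == 104, "float path: K rounded up to 8 elements");
static_assert(AlignUp(96, 16) == 96, "already-aligned K is unchanged");

Hoisting this into the tiling stage removes a branch and a division from every kernel invocation, at the cost of requiring the host-side tiling code to fill splitK consistently for both the half and float paths.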
length of the inner axis must be 32!", apiName, hostFuncName);
-    } else if (mode == TopKMode::TOPK_NORMAL) {
-        ASCENDC_HOST_ASSERT(inner <= 4096, return,
-            "[%s][%s] In Normal mode, the maximum length of the inner axis is 4096!", apiName, hostFuncName);
     }
     if (socVersion == platform_ascendc::SocVersion::ASCEND310P && isInitIndex == false) {
         ASCENDC_HOST_ASSERT(inner <= 2048, return,
diff --git a/tests/activation/gelu/test_operator_fast_gelu.cpp b/tests/activation/gelu/test_operator_fast_gelu.cpp
index ec97dba56d061217fc444e4bd25bab50f1ea6932..d15dcfb2c4a31bd9e373677a88393ab42cce4fee 100644
--- a/tests/activation/gelu/test_operator_fast_gelu.cpp
+++ b/tests/activation/gelu/test_operator_fast_gelu.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
index f4d98b554d2c2d9ab61a536a195eeed455b6023d..6974cdd1c8eacffe24fcac855d8c0337a517ed13 100644
--- a/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
+++ b/tests/activation/gelu/test_operator_fast_gelu_v2.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/gelu/test_operator_gelu.cpp b/tests/activation/gelu/test_operator_gelu.cpp
index 387727a056f0584137ba1e6466d7dbb1807d2ba2..c668d6597bc01a76add3d9e85a9e132575bd8828 100644
--- a/tests/activation/gelu/test_operator_gelu.cpp
+++ b/tests/activation/gelu/test_operator_gelu.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/silu/test_operator_silu.cpp b/tests/activation/silu/test_operator_silu.cpp
index a001b04e73d6f689984a7ca600b8d143c9f95387..ad7eeb158f86d04cb2afbc619a9715162afa9ac8 100644
--- a/tests/activation/silu/test_operator_silu.cpp
+++ b/tests/activation/silu/test_operator_silu.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp
index 1261bd042ad621903ef3f8abe3d84aaaf0b9fc9c..c7eacc538ecfb0ac2c88cdf6e36cd7135dbf8b70 100644
--- a/tests/activation/softmax/test_operator_softmax_v220.cpp
+++ b/tests/activation/softmax/test_operator_softmax_v220.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include
diff --git a/tests/activation/softmax/test_operator_softmax_v300.cpp b/tests/activation/softmax/test_operator_softmax_v300.cpp
index d154acdf180060c01319396dd77b909bc2ee26a4..c8e2e781c7164dd4414e97c61ed576e74ff19fd3 100644
--- a/tests/activation/softmax/test_operator_softmax_v300.cpp
+++ b/tests/activation/softmax/test_operator_softmax_v300.cpp
@@ -3,7 +3,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include
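Every test-file hunk from here to the end of the patch is the same one-line fix: these white-box tests meant to write `#define protected public`, but `protect` is not a C++ keyword, so the misspelled macro rewrote nothing and protected members stayed hidden from the tests; only the `#define private public` line was doing any work. A minimal, self-contained illustration of the trick (the Widget class is hypothetical; the real tests include the header under test between the defines instead):

// Formally undefined behavior (redefining keywords), but a common test-only
// trick for white-box assertions; never mix with normally compiled TUs.
#define private public
#define protected public
// #include "header_under_test.h" would normally go here
class Widget {
protected:
    int internalState = 0;  // normally inaccessible from a test
private:
    int secret = 42;        // likewise
};
#undef private
#undef protected

#include <cassert>

int main()
{
    Widget w;
    assert(w.internalState == 0);  // visible: 'protected' expanded to 'public'
    assert(w.secret == 42);        // visible: 'private' expanded to 'public'
    return 0;
}

With the typo, the `protected:` sections were left untouched, which is exactly what this sweep across the test suite corrects.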
diff --git a/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp b/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp
index c93705e4926940553be53bd8e7c0b9ee402638cb..84f0888f9c907f1c8d6391470f5e83aa0fe9ffc1 100644
--- a/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp
+++ b/tests/activation/softmax/test_operator_softmaxflashv3_v220.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include
diff --git a/tests/activation/swish/test_operator_swish.cpp b/tests/activation/swish/test_operator_swish.cpp
index 9e9103f3cd5519a781e54d18e6756975a75f4ea0..e37a5b6612c488535dc1e24f7d7ff7f9565fb765 100644
--- a/tests/activation/swish/test_operator_swish.cpp
+++ b/tests/activation/swish/test_operator_swish.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/filter/dropout/test_operator_dropout.cpp b/tests/filter/dropout/test_operator_dropout.cpp
index 7e7b9f41252a6341455e5fe70a51ede1f6e5309f..32dd61b41013286ea126f13ac5a0d7dd1fb1315a 100644
--- a/tests/filter/dropout/test_operator_dropout.cpp
+++ b/tests/filter/dropout/test_operator_dropout.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/index/arithprogression/test_operator_arithprogression.cpp b/tests/index/arithprogression/test_operator_arithprogression.cpp
index ca52d8a259cce0e2604ac1478a1a2e3c02441c64..9ac8d53f59e46e767d4ef600b51a272cf4b84a8b 100644
--- a/tests/index/arithprogression/test_operator_arithprogression.cpp
+++ b/tests/index/arithprogression/test_operator_arithprogression.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/math/exp/test_operator_exphighprecision.cpp b/tests/math/exp/test_operator_exphighprecision.cpp
index 882199b26e51200bc12152568d5b1e52212662b7..e0bbe7a523c90aca86eba7455af634cf719380f0 100644
--- a/tests/math/exp/test_operator_exphighprecision.cpp
+++ b/tests/math/exp/test_operator_exphighprecision.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
diff --git a/tests/normalization/batchnorm/test_operator_batchnorm.cpp b/tests/normalization/batchnorm/test_operator_batchnorm.cpp
index e255c92698306af584bc1ba0cc8b7c58ead086ec..facee7f5e7df1fa866da2428d951fddcfa3dd670 100644
--- a/tests/normalization/batchnorm/test_operator_batchnorm.cpp
+++ b/tests/normalization/batchnorm/test_operator_batchnorm.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/deepnorm/test_operator_deepnorm.cpp b/tests/normalization/deepnorm/test_operator_deepnorm.cpp
index bbc72b7f5227cac56de1d9388b540eb27ae73b2d..c6c1e9434a4fed95113083bceed6e68bfd001552 100644
--- a/tests/normalization/deepnorm/test_operator_deepnorm.cpp
+++ b/tests/normalization/deepnorm/test_operator_deepnorm.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
index 7dd522b972f24d7d4f178682dd2f454e4547dbcb..71f98669f6fd19bec6483fda46ef9001c95e2c13 100644
--- a/tests/normalization/groupnorm/test_operator_groupnorm.cpp
+++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
@@ -15,7 +15,7 @@
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernorm/test_operator_layernorm.cpp b/tests/normalization/layernorm/test_operator_layernorm.cpp
index 5875945b7fe71017d333fb3123530abe89257c8f..a3bc7836a767bb7dd097935fc32b3e4894ba57eb 100644
--- a/tests/normalization/layernorm/test_operator_layernorm.cpp
+++ b/tests/normalization/layernorm/test_operator_layernorm.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernorm/test_operator_layernormgrad.cpp b/tests/normalization/layernorm/test_operator_layernormgrad.cpp
index fab91ded70bd33167e92144e1895067c433b7541..81765f38a107560a4ae16e8f73eab92a01561ea6 100644
--- a/tests/normalization/layernorm/test_operator_layernormgrad.cpp
+++ b/tests/normalization/layernorm/test_operator_layernormgrad.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp b/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp
index 21dd3578e933f8f8532571601022a757841441fc..30d827cb4493f7cd7418bfef2cf8abd9fe89d16d 100644
--- a/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp
+++ b/tests/normalization/layernorm/test_operator_layernormgradbeta.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/layernormV2/test_operator_layernormV2.cpp b/tests/normalization/layernormV2/test_operator_layernormV2.cpp
index 0af89bd937016bc9d6292b3105f5ccb0e851be3e..fe25a875c04fac7e7ecc5b2eb4e5186db2f0d019 100644
--- a/tests/normalization/layernormV2/test_operator_layernormV2.cpp
+++ b/tests/normalization/layernormV2/test_operator_layernormV2.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/normalize/test_operator_normalize.cpp b/tests/normalization/normalize/test_operator_normalize.cpp
index fd975d90e6280eafe5326c407f9b53c6bd6c3253..17751b1d2b5a3e3f52f5f5e0e5aeb80c97ed31c5 100644
--- a/tests/normalization/normalize/test_operator_normalize.cpp
+++ b/tests/normalization/normalize/test_operator_normalize.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp b/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp
index 42d7072f77d66d95777e09a63ed4a4d94e2896fb..6cf9fb734b8288cd6e7658e14b79552baa6700f9 100644
--- a/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp
+++ b/tests/normalization/rmsnorm/test_operator_rmsnorm.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 
 using namespace std;
diff --git a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
index c5b08ec14784671fd8c33d8ef78da54de7f2366e..1411599de9f87ad10ec3f6fb26f9104701a513e2 100644
--- a/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
+++ b/tests/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include
 using namespace std;
diff --git a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp
index 654a40a667a8231fb4a12c60adfcbaf5dd40a81c..8833226c845bf6d758bfe39abe025742644ca1e5 100644
--- a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp
+++ b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include
 using namespace std;
diff --git a/tests/pad/pad/test_operator_pad.cpp b/tests/pad/pad/test_operator_pad.cpp
index 8bed6873f30260fd17816b70a1825cb3c9a05b17..f7e4e51c3d5eaa455c5692b8e8ca9cb29ba107da 100644
--- a/tests/pad/pad/test_operator_pad.cpp
+++ b/tests/pad/pad/test_operator_pad.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include
diff --git a/tests/quantization/antiquant/test_ascend_antiquant.cpp b/tests/quantization/antiquant/test_ascend_antiquant.cpp
index 105d9151283832a4a330ea8a35e8d870a0e1a398..af9b3a6c882c62f0318fbb4ee6e2385dc05509c1 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 
diff --git a/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp b/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp
index f631cf0e31203ba3e6307d60cefc5843da5abb85..1128ed3d8d1f9a095bc81cf01ce66c34408f8ca0 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant_scalar.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 
diff --git a/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp b/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp
index d66a237374e4205ea6b590e98e738b2ad3bc9da5..1ebd44718ab6fd3394fc7cd12f315ccba81a933d 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant_weight.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_operator_intf.h"
 #include "kernel_utils.h"
diff --git a/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp b/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp
index 571c86edb00d98681df8db35ce553bcf87d52914..d4641a89de006e054a7a4a8559298614256cf4ba 100644
--- a/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp
+++ b/tests/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 
diff --git a/tests/reduce/mean/test_operator_mean.cpp b/tests/reduce/mean/test_operator_mean.cpp
index eb96f1846f6254d1555e14431f06b7cb502963e3..cf457c2fb7ae83c9a1d5c18706c3b4c21374a222 100644
--- a/tests/reduce/mean/test_operator_mean.cpp
+++ b/tests/reduce/mean/test_operator_mean.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include
diff --git a/tests/reduce/sum/test_operator_sum.cpp b/tests/reduce/sum/test_operator_sum.cpp
index 015bcf708c1472e8d0cf839e5c08c45d6eeec7f3..6ca219166e4b32853586f090f8e0f2db09e4169f 100644
--- a/tests/reduce/sum/test_operator_sum.cpp
+++ b/tests/reduce/sum/test_operator_sum.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include
diff --git a/tests/sort/topk/test_operator_topk.cpp b/tests/sort/topk/test_operator_topk.cpp
index a1f80967198878df07c453eb10f2e1f2ecccc334..ab8a14f6e0cadf4e176031104a94a9dc7366e81c 100644
--- a/tests/sort/topk/test_operator_topk.cpp
+++ b/tests/sort/topk/test_operator_topk.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include
 using namespace std;
diff --git a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp
index c0624cc1485f1048d5b197d97b4ee116fcdbd38e..f3aa42cd9f07db2927ec0cf7cc95689247a7ef8e 100644
--- a/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp
+++ b/tests/transpose/confusion_transpose/test_operator_confusion_transpose.cpp
@@ -9,7 +9,7 @@
  */
 #include
 #define private public
-#define protect public
+#define protected public
 #include "kernel_operator.h"
 #include "kernel_utils.h"
 #include