diff --git a/.gitee/PULL_REQUEST_TEMPLATE.md b/.gitee/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..e068cab73329100ffc48110d03a501f736595a38 --- /dev/null +++ b/.gitee/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,24 @@ + + +**What type of PR is this?** +> Uncomment only one ` /kind <>` line, hit enter to put that in a new line, and remove leading whitespaces from that line: +> +> /kind bug +> /kind task +> /kind feature + + +**What does this PR do / why do we need it**: + + +**Which issue(s) this PR fixes**: + +Fixes # + +**Special notes for your reviewers**: + + diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 01dd62bcae5c4f7c074b0838be937578d4e2e9e2..0000000000000000000000000000000000000000 --- a/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2021, 赵光灿 -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000000000000000000000000000000000000..8c69af9528b4fec6e30616ab251a6f3cef208b94 --- /dev/null +++ b/OWNERS @@ -0,0 +1,9 @@ +approvers: +- blian + +reviewers: +- goodjobwubai +- fairwarning +- junqiang521 +- ascendzyj +- zhou_sinan diff --git a/README.zh.md b/README.zh.md new file mode 100644 index 0000000000000000000000000000000000000000..27fdf436fc7e03d92abc87316877df2904b08fe8 --- /dev/null +++ b/README.zh.md @@ -0,0 +1,53 @@ +# Apex installation and usage guide for Ascend 910 + + +## Generating the full source and building +1. Make sure the NPU version of PyTorch works properly; + +2. Enter the build directory and run +``` +bash build.sh +``` +This generates the full NPU-adapted code in the apex-npu directory under the repository root and builds a whl package in the dist directory. Running +``` +bash build.sh gen +``` +generates the full code only. + + +## Installation +Enter the dist directory and run the following commands: +``` +pip3 uninstall apex +pip3 install --upgrade apex-0.1+ascend-cp37-cp37m-linux_{arch}.whl # {arch} is the architecture, aarch64 or x86_64 +``` + + +## Features +**Supported:** +- [x] O1 mode +- [x] O2 mode +- [x] static loss scale +- [x] dynamic loss scale +- [x] combine tensors +- [x] combine grad for unscale +- [x] npu fused optimizer: adadelta, adam, sgd, lamb + +**Notes:** + +The current version is a pure Python implementation; ACL and CUDA optimizations are not supported. + + +## Usage +**Automatic mixed precision:** + +See https://nvidia.github.io/apex/amp.html + +**combine grad for unscale:** + +Set the combine_grad argument to True in amp.initialize() + +**npu fused optimizer:** + +Replace the original optimizer with apex.optimizers.xxx, where xxx is the name of the fused optimizer + diff --git a/build/build.sh b/build/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..ddddbe07ec63a5476e754e7288663158fc137f3c --- /dev/null +++ b/build/build.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +CUR_DIR=$(dirname $(readlink -f $0)) +ROOT_DIR=$CUR_DIR/.. +APEX_DIR=$ROOT_DIR/apex-npu +APEX_PKG=apex.tar.gz + +function main() +{ + cd $ROOT_DIR || exit + # get ori apex + if [ -f $ROOT_DIR/$APEX_PKG ];then + echo "detect $APEX_PKG exist, skip download" + else + wget https://ascend-ptadapter.obs.cn-north-4.myhuaweicloud.com/pytorch-v1.5.0/$APEX_PKG --no-check-certificate + fi + + if [ $? != 0 ]; then + echo "Failed to wget source code of apex, check network."
+ exit 1 + fi + + # mkdir apex + if [ -d $APEX_DIR ];then + echo "$APEX_DIR exists, if nothing to backup, please remove it" + exit 1 + fi + + # unpack + tar -xf $APEX_PKG + mv apex $APEX_DIR + + echo "download and unpack $APEX_PKG success" + + # patch + cp $ROOT_DIR/patch/npu.patch $APEX_DIR + cd $APEX_DIR || exit + patch -p1 0 and list_of_params[0].grad is not None: ++ list_of_grad = [] ++ for param in list_of_params: ++ if param.requires_grad: ++ list_of_grad.append(param.grad) ++ original_combined_tensor = combine_npu(list_of_grad) ++ return original_combined_tensor, list_of_grad ++ else: ++ return None, [] ++ ++ ++def get_grad_combined_tensor_mask_from_param(list_of_params): ++ if len(list_of_params) > 0 and list_of_params[0].grad is not None: ++ list_of_grad_mask = [] ++ for param in list_of_params: ++ if param.requires_grad: ++ grad_size = param.grad.size() ++ grad_format = param.storage().npu_format() ++ list_of_grad_mask.append(torch.ones(grad_size).npu().npu_format_cast(grad_format)) ++ grad_combined_tensor_mask = combine_npu(list_of_grad_mask) ++ return grad_combined_tensor_mask ++ else: ++ return None + + + class AmpOptimizerState(object): +@@ -26,96 +69,114 @@ + + + def lazy_init_with_master_weights(self): +- stash = self._amp_stash +- stash.fp16_groups = [] +- stash.fp32_from_fp16_groups = [] +- stash.fp32_from_fp32_groups = [] +- for i, param_group in enumerate(self.param_groups): +- # maybe_print("FP16_Optimizer processing param group {}:".format(i)) +- fp16_params_this_group = [] +- fp32_params_this_group = [] +- fp32_from_fp16_params_this_group = [] +- for i, param in enumerate(param_group['params']): +- if param.requires_grad: +- if param.type() == 'torch.cuda.HalfTensor': +- # maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" +- # .format(param.size())) +- fp16_params_this_group.append(param) +- master_param = param.detach().clone().float() +- master_param.requires_grad = True +- param_group['params'][i] = master_param +- fp32_from_fp16_params_this_group.append(master_param) +- # Reset existing state dict key to the new master param. +- # We still need to recast per-param state tensors, if any, to FP32. +- if param in self.state: +- self.state[master_param] = self.state.pop(param) +- elif param.type() == 'torch.cuda.FloatTensor': +- # maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" +- # .format(param.size())) +- fp32_params_this_group.append(param) +- param_group['params'][i] = param +- else: +- raise TypeError("Optimizer's parameters must be either " +- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. 
" +- "Received {}".format(param.type())) +- +- stash.fp16_groups.append(fp16_params_this_group) +- stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) +- stash.fp32_from_fp32_groups.append(fp32_params_this_group) ++ stash = self._amp_stash ++ stash.fp16_groups = [] ++ stash.fp32_from_fp16_groups = [] ++ stash.fp32_from_fp32_groups = [] ++ for i, param_group in enumerate(self.param_groups): ++ # maybe_print("FP16_Optimizer processing param group {}:".format(i)) ++ fp16_params_this_group = [] ++ fp32_params_this_group = [] ++ fp32_from_fp16_params_this_group = [] ++ for i, param in enumerate(param_group['params']): ++ if param.requires_grad: ++ if param.type() == 'torch.npu.HalfTensor': ++ # maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" ++ # .format(param.size())) ++ fp16_params_this_group.append(param) ++ master_param = param.detach().clone().float() ++ master_param.requires_grad = True ++ param_group['params'][i] = master_param ++ fp32_from_fp16_params_this_group.append(master_param) ++ # Reset existing state dict key to the new master param. ++ # We still need to recast per-param state tensors, if any, to FP32. ++ if param in self.state: ++ self.state[master_param] = self.state.pop(param) ++ elif param.type() == 'torch.npu.FloatTensor': ++ # maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" ++ # .format(param.size())) ++ fp32_params_this_group.append(param) ++ param_group['params'][i] = param ++ else: ++ raise TypeError("Optimizer's parameters must be either " ++ "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " ++ "Received {}".format(param.type())) ++ ++ stash.fp16_groups.append(fp16_params_this_group) ++ stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) ++ stash.fp32_from_fp32_groups.append(fp32_params_this_group) + +- stash.all_fp16_params = [] +- for group in stash.fp16_groups: +- stash.all_fp16_params += group +- +- stash.all_fp32_from_fp16_params = [] +- for group in stash.fp32_from_fp16_groups: +- stash.all_fp32_from_fp16_params += group +- +- stash.all_fp32_from_fp32_params = [] +- for group in stash.fp32_from_fp32_groups: +- stash.all_fp32_from_fp32_params += group +- +- # all_fp16_grad_stash is only needed for fused optimizers. +- stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params] +- # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params] +- stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params] ++ stash.all_fp16_params = [] ++ for group in stash.fp16_groups: ++ stash.all_fp16_params += group + +- for param in stash.all_fp32_from_fp16_params: +- param.grad = None ++ stash.all_fp32_from_fp16_params = [] ++ for group in stash.fp32_from_fp16_groups: ++ stash.all_fp32_from_fp16_params += group ++ ++ stash.all_fp32_from_fp32_params = [] ++ for group in stash.fp32_from_fp32_groups: ++ stash.all_fp32_from_fp32_params += group + +- for param in stash.all_fp32_from_fp32_params: +- param.grad = None ++ # all_fp16_grad_stash is only needed for fused optimizers. 
++ stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params] ++ # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params] ++ stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params] + +- # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors +- self.load_state_dict(self.state_dict()) ++ for param in stash.all_fp32_from_fp16_params: ++ param.grad = None + ++ for param in stash.all_fp32_from_fp32_params: ++ param.grad = None ++ ++ stash.main_fp16_grad_combine = None ++ stash.main_fp32_from_fp16_grad_combine = None ++ stash.main_fp32_from_fp32_grad_combine = None ++ stash.main_fp32_from_fp16_grad_combine_mask = None ++ stash.main_fp32_from_fp32_grad_combine_mask = None ++ ++ stash.all_fp32_from_fp32_grad_stash_combine = None ++ ++ stash.main_fp16_param_combine = None ++ stash.main_fp32_from_fp16_param_combine = None ++ stash.main_fp32_from_fp32_param_combine = None ++ # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors ++ self.load_state_dict(self.state_dict()) ++ ++ ++def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None, ++ main_grads_combined=None, stashed_grads_combined=None, ++ use_npu_fused_optimizer=False, stashed_grads_are_zero=False): ++ grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0 + +-def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None): +- grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0 ++ # not much to do if scale == 1.0 and static scaling ++ if scaler.loss_scale() == 1.0 and not scaler.dynamic: ++ # Clear the stash. ++ for i in range(len(stashed_grads)): ++ stashed_grads[i] = None ++ return + +- # not much to do if scale == 1.0 and static scaling +- if scaler.loss_scale() == 1.0 and not scaler.dynamic: +- # Clear the stash. +- for i in range(len(stashed_grads)): +- stashed_grads[i] = None +- return +- +- if scale_override is not None: +- grads_have_scale, stashed_have_scale, out_scale = scale_override ++ if scale_override is not None: ++ grads_have_scale, stashed_have_scale, out_scale = scale_override + +- # This is a lot of python overhead... ++ # This is a lot of python overhead... ++ if main_grads_combined is not None: ++ scaler.unscale_with_stashed_combined( ++ main_grads_combined, stashed_grads_combined, ++ scale_override=(grads_have_scale, stashed_have_scale, out_scale)) ++ else: + grads_needing_unscale = [] + grads_needing_unscale_with_stash = [] + stashed = [] + for param, stashed_grad in zip(params, stashed_grads): + if param.grad is None and stashed_grad is not None: + param.grad = stashed_grad +- elif param.grad is not None and stashed_grad is None: ++ elif param.grad is not None and (stashed_grad is None or stashed_grads_are_zero): + grads_needing_unscale.append(param.grad) + elif param.grad is not None and stashed_grad is not None: + grads_needing_unscale_with_stash.append(param.grad) + stashed.append(stashed_grad) +- else: # param.grad is None and stashed_grad is None ++ else: # param.grad is None and stashed_grad is None + continue + + # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale. 
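# A small numeric illustration (values assumed) of the scale bookkeeping in the
# comment above: a gradient produced under loss scale grads_have_scale is brought
# to out_scale by multiplying with out_scale / grads_have_scale, and a previously
# stashed gradient is folded in with its own factor out_scale / stashed_have_scale.
import torch

grads_have_scale, stashed_have_scale, out_scale = 1024.0, 1.0, 1.0
model_grad = torch.tensor([2048.0])   # backward ran on loss * 1024
stashed_grad = torch.tensor([3.0])    # already-unscaled grad from an earlier backward

master_grad = ((out_scale / grads_have_scale) * model_grad
               + (out_scale / stashed_have_scale) * stashed_grad)
assert torch.allclose(master_grad, torch.tensor([5.0]))   # 2048/1024 + 3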
+@@ -123,130 +184,259 @@ + scaler.unscale( + grads_needing_unscale, + grads_needing_unscale, +- None, # unused_scale, currently present to avoid API breakage elsewhere ++ None, # unused_scale, currently present to avoid API breakage elsewhere + models_are_masters=True, +- scale_override=grads_have_scale/out_scale) ++ scale_override=grads_have_scale / out_scale) + + if len(grads_needing_unscale_with_stash) > 0: + scaler.unscale_with_stashed( + grads_needing_unscale_with_stash, + stashed, + grads_needing_unscale_with_stash, +- scale_override=(grads_have_scale, stashed_have_scale, out_scale)) ++ scale_override=(grads_have_scale, stashed_have_scale, out_scale), ++ use_npu_fused_optimizer=use_npu_fused_optimizer) + +- # Clear the stash. +- for i in range(len(stashed_grads)): +- stashed_grads[i] = None ++ if not use_npu_fused_optimizer: ++ # Clear the stash. ++ for i in range(len(stashed_grads)): ++ stashed_grads[i] = None + + + def prepare_backward_with_master_weights(self): + stash = self._amp_stash + + self._amp_lazy_init() ++ self._check_already_combined_params_and_grads() + +- for i, param in enumerate(stash.all_fp16_params): +- # Set up to leverage grad copy elision. +- # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused. +- param.grad = None ++ if (self.accelerate or self.is_npu_fused_optimizer) and stash.already_combined: ++ if stash.process_zero_grad: ++ return ++ ++ if stash.main_fp16_grad_combine is not None: ++ stash.main_fp16_grad_combine.zero_() ++ ++ if stash.main_fp32_from_fp32_grad_combine is not None: ++ stash.all_fp32_from_fp32_grad_stash_combine.copy_(stash.main_fp32_from_fp32_grad_combine) ++ stash.main_fp32_from_fp32_grad_combine.zero_() ++ else: ++ for i, param in enumerate(stash.all_fp16_params): ++ # Set up to leverage grad copy elision. ++ # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused. 
++ param.grad = None ++ ++ # for i, param in enumerate(stash.all_fp32_from_fp16_params): ++ # stash.all_fp32_from_fp16_grad_stash[i] = param.grad ++ ++ for i, param in enumerate(stash.all_fp32_from_fp32_params): ++ stash.all_fp32_from_fp32_grad_stash[i] = param.grad ++ # Set up to leverage grad copy elision: ++ param.grad = None ++ ++ ++@torch.no_grad() ++def combined_init_with_master_weights(self): ++ stash = self._amp_stash ++ if stash.already_combined: ++ return + +- # for i, param in enumerate(stash.all_fp32_from_fp16_params): +- # stash.all_fp32_from_fp16_grad_stash[i] = param.grad ++ if (not self.accelerate) and (not self.is_npu_fused_optimizer): ++ return + + for i, param in enumerate(stash.all_fp32_from_fp32_params): +- stash.all_fp32_from_fp32_grad_stash[i] = param.grad +- # Set up to leverage grad copy elision: +- param.grad = None ++ if param.grad is not None: ++ stash.all_fp32_from_fp32_grad_stash[i] = torch.zeros_like(param.grad) ++ ++ if len(stash.all_fp32_from_fp32_grad_stash) > 0: ++ stash.all_fp32_from_fp32_grad_stash_combine = combine_npu(stash.all_fp32_from_fp32_grad_stash) ++ ++ all_fp16_params, all_fp32_from_fp16_params = [], [] ++ for fp16_param, fp32_from_fp16_param in zip(stash.all_fp16_params, stash.all_fp32_from_fp16_params): ++ if fp16_param.grad is not None: ++ all_fp16_params.append(fp16_param) ++ all_fp32_from_fp16_params.append(fp32_from_fp16_param) ++ stash.all_fp16_params = all_fp16_params ++ stash.all_fp32_from_fp16_params = all_fp32_from_fp16_params ++ ++ stash.main_fp16_grad_combine, stash.fp16_grad_list = get_grad_combined_tensor_from_param(stash.all_fp16_params) ++ ++ for fp16_grad, fp32_from_fp16_param in zip(stash.fp16_grad_list, stash.all_fp32_from_fp16_params): ++ if fp16_grad.storage().npu_format() == fp32_from_fp16_param.storage().npu_format(): ++ fp32_from_fp16_param.grad = torch.zeros_like(fp32_from_fp16_param) ++ else: ++ fp32_from_fp16_param.grad = torch.zeros_like(fp16_grad.to(torch.float)) ++ fp32_from_fp16_param.data = fp32_from_fp16_param.data.npu_format_cast(fp16_grad.storage().npu_format()) ++ ++ stash.main_fp32_from_fp16_grad_combine, stash.fp32_from_fp16_grad_list = \ ++ get_grad_combined_tensor_from_param(stash.all_fp32_from_fp16_params) ++ stash.main_fp32_from_fp32_grad_combine, stash.fp32_from_fp32_grad_list = \ ++ get_grad_combined_tensor_from_param(stash.all_fp32_from_fp32_params) ++ # please do not change the order of tensor in this list. ++ stash.grads_list = [stash.main_fp16_grad_combine, ++ stash.main_fp32_from_fp16_grad_combine, ++ stash.main_fp32_from_fp32_grad_combine] ++ ++ if self.is_npu_fused_optimizer: ++ # stash.main_fp16_param_combine = combine_npu(stash.all_fp16_params) ++ stash.main_fp32_from_fp16_param_combine = combine_npu(stash.all_fp32_from_fp16_params) ++ stash.main_fp32_from_fp32_param_combine = combine_npu(stash.all_fp32_from_fp32_params) ++ ++ stash.already_combined = True + + + def post_backward_with_master_weights(self, scaler): + stash = self._amp_stash + + self._amp_lazy_init() ++ self._check_already_combined_params_and_grads() ++ self._amp_combined_init() + +- # This is a lot of python overhead... 
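# A plain-torch sketch (not part of the patch) of the "combined tensor" idea that
# combine_npu / get_grad_combined_tensor_from_param rely on above; combine_npu is
# an NPU-specific helper, and its semantics are assumed here from the way it is
# used: one flat buffer whose slices replace the individual per-parameter grads.
import torch

def combine_grads_sketch(params):
    grads = [p.grad for p in params if p.requires_grad and p.grad is not None]
    if not grads:
        return None
    flat = torch.zeros(sum(g.numel() for g in grads), dtype=grads[0].dtype)
    offset = 0
    for p in params:
        if p.requires_grad and p.grad is not None:
            n = p.grad.numel()
            flat[offset:offset + n].copy_(p.grad.reshape(-1))
            p.grad = flat[offset:offset + n].view_as(p.grad)   # grad becomes a view
            offset += n
    return flat

# Because every grad now shares the buffer's storage, a single call such as
# flat.zero_() or flat.mul_(1.0 / loss_scale) touches all gradients at once,
# which is what the combined zero_grad / unscale paths in this patch build on.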
+- fp16_grads_needing_unscale = [] +- new_fp32_grads = [] +- fp16_grads_needing_unscale_with_stash = [] +- preexisting_fp32_grads = [] +- for fp16_param, fp32_param in zip(stash.all_fp16_params, +- stash.all_fp32_from_fp16_params): +- if fp16_param.grad is None and fp32_param.grad is not None: +- continue +- elif fp16_param.grad is not None and fp32_param.grad is None: +- fp32_param.grad = torch.empty_like(fp32_param) +- fp16_grads_needing_unscale.append(fp16_param.grad) +- new_fp32_grads.append(fp32_param.grad) +- elif fp16_param.grad is not None and fp32_param.grad is not None: +- fp16_grads_needing_unscale_with_stash.append(fp16_param.grad) +- preexisting_fp32_grads.append(fp32_param.grad) +- else: # fp16_param.grad is None and fp32_param.grad is None: +- continue +- +- if len(fp16_grads_needing_unscale) > 0: +- scaler.unscale( +- fp16_grads_needing_unscale, +- new_fp32_grads, +- scaler.loss_scale(), +- models_are_masters=False) +- +- if len(fp16_grads_needing_unscale_with_stash) > 0: +- scaler.unscale_with_stashed( +- fp16_grads_needing_unscale_with_stash, +- preexisting_fp32_grads, +- preexisting_fp32_grads) +- +- # fp32 params can be treated as they would be in the "no_master_weights" case. +- post_backward_models_are_masters( +- scaler, +- stash.all_fp32_from_fp32_params, +- stash.all_fp32_from_fp32_grad_stash) ++ if self.accelerate: ++ scaler.unscale_grad_O2( ++ model_grads_combined=stash.main_fp16_grad_combine, ++ stashed_master_grads_combined=stash.main_fp32_from_fp16_grad_combine if not stash.process_zero_grad else None, ++ master_grads_combined=stash.main_fp32_from_fp16_grad_combine, ++ master_grads=stash.fp32_from_fp16_grad_list, ++ model_grads=stash.fp16_grad_list) ++ if stash.main_fp32_from_fp32_grad_combine is not None: ++ scaler.unscale_grad_O2( ++ model_grads_combined=stash.main_fp32_from_fp32_grad_combine, ++ stashed_master_grads_combined=stash.all_fp32_from_fp32_grad_stash_combine if not stash.process_zero_grad else None, ++ master_grads_combined=stash.main_fp32_from_fp32_grad_combine) ++ else: ++ # This is a lot of python overhead... ++ fp16_grads_needing_unscale = [] ++ new_fp32_grads = [] ++ fp16_grads_needing_unscale_with_stash = [] ++ preexisting_fp32_grads = [] ++ for fp16_param, fp32_param in zip(stash.all_fp16_params, ++ stash.all_fp32_from_fp16_params): ++ if fp16_param.grad is None and fp32_param.grad is not None: ++ continue ++ elif fp16_param.grad is not None and fp32_param.grad is None: ++ fp32_param.grad = torch.empty_like(fp32_param) ++ fp16_grads_needing_unscale.append(fp16_param.grad) ++ new_fp32_grads.append(fp32_param.grad) ++ elif fp16_param.grad is not None and fp32_param.grad is not None: ++ if stash.process_zero_grad: ++ fp16_grads_needing_unscale.append(fp16_param.grad) ++ new_fp32_grads.append(fp32_param.grad) ++ else: ++ fp16_grads_needing_unscale_with_stash.append(fp16_param.grad) ++ preexisting_fp32_grads.append(fp32_param.grad) ++ else: # fp16_param.grad is None and fp32_param.grad is None: ++ continue ++ ++ if len(fp16_grads_needing_unscale) > 0: ++ scaler.unscale( ++ fp16_grads_needing_unscale, ++ new_fp32_grads, ++ scaler.loss_scale(), ++ models_are_masters=False) ++ ++ if len(fp16_grads_needing_unscale_with_stash) > 0: ++ scaler.unscale_with_stashed( ++ fp16_grads_needing_unscale_with_stash, ++ preexisting_fp32_grads, ++ preexisting_fp32_grads, ++ use_npu_fused_optimizer=self.is_npu_fused_optimizer) ++ ++ # fp32 params can be treated as they would be in the "no_master_weights" case. 
++ post_backward_models_are_masters( ++ scaler, ++ stash.all_fp32_from_fp32_params, ++ stash.all_fp32_from_fp32_grad_stash, ++ use_npu_fused_optimizer=self.is_npu_fused_optimizer, ++ stashed_grads_are_zero=stash.process_zero_grad) ++ ++ stash.process_zero_grad = False + + + def lazy_init_no_master_weights(self): + stash = self._amp_stash + stash.all_fp16_params = [] + stash.all_fp32_params = [] ++ ++ check_param_require_grad = self.accelerate or self.is_npu_fused_optimizer ++ + for i, param_group in enumerate(self.param_groups): + for i, param in enumerate(param_group['params']): +- if param.type() == 'torch.cuda.HalfTensor': ++ if check_param_require_grad and not param.requires_grad: ++ continue ++ ++ if param.type() == 'torch.npu.HalfTensor': + stash.all_fp16_params.append(param) +- elif param.type() == 'torch.cuda.FloatTensor': ++ elif param.type() == 'torch.npu.FloatTensor': + stash.all_fp32_params.append(param) + else: + raise TypeError("Optimizer's parameters must be either " +- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " ++ "torch.npu.FloatTensor or torch.npu.HalfTensor." + "Received {}".format(param.type())) + + stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params] + stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params] + ++ stash.all_fp16_grad_stash_combine = None ++ stash.all_fp32_grad_stash_combine = None ++ ++ stash.fp16_grad_list = [] ++ stash.main_fp16_grad_combine = None ++ stash.main_fp16_grad_combine_mask = None ++ ++ stash.fp32_grad_list = [] ++ stash.main_fp32_grad_combine = None ++ stash.main_fp32_grad_combine_mask = None ++ ++ stash.main_fp16_param_combine = None ++ stash.main_fp32_param_combine = None ++ + + def prepare_backward_no_master_weights(self): + stash = self._amp_stash + + self._amp_lazy_init() ++ self._check_already_combined_params_and_grads() + +- for i, param in enumerate(stash.all_fp16_params): +- stash.all_fp16_grad_stash[i] = param.grad +- # Set up to leverage grad copy elision: +- param.grad = None ++ if (self.accelerate or self.is_npu_fused_optimizer) and stash.already_combined: ++ if stash.main_fp16_grad_combine is not None: ++ stash.all_fp16_grad_stash_combine.copy_(stash.main_fp16_grad_combine) ++ stash.main_fp16_grad_combine.zero_() ++ if stash.main_fp32_grad_combine is not None: ++ stash.all_fp32_grad_stash_combine.copy_(stash.main_fp32_grad_combine) ++ stash.main_fp32_grad_combine.zero_() ++ else: ++ for i, param in enumerate(stash.all_fp16_params): ++ stash.all_fp16_grad_stash[i] = param.grad ++ # Set up to leverage grad copy elision: ++ param.grad = None + +- for i, param in enumerate(stash.all_fp32_params): +- stash.all_fp32_grad_stash[i] = param.grad +- # Set up to leverage grad copy elision: +- param.grad = None ++ for i, param in enumerate(stash.all_fp32_params): ++ stash.all_fp32_grad_stash[i] = param.grad ++ # Set up to leverage grad copy elision: ++ param.grad = None + + + def post_backward_no_master_weights(self, scaler): + stash = self._amp_stash + + self._amp_lazy_init() ++ self._check_already_combined_params_and_grads() ++ self._amp_combined_init() + +- split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash), +- (stash.all_fp32_params, stash.all_fp32_grad_stash)) ++ if self.accelerate: ++ split_types = ((stash.main_fp16_grad_combine, stash.all_fp16_grad_stash_combine), ++ (stash.main_fp32_grad_combine, stash.all_fp32_grad_stash_combine)) ++ for main_grads_combined, stash_grads_combined in split_types: ++ if main_grads_combined is not None: ++ post_backward_models_are_masters(scaler, None, 
None, None, ++ main_grads_combined, stash_grads_combined, ++ use_npu_fused_optimizer=self.is_npu_fused_optimizer) ++ else: ++ split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash), ++ (stash.all_fp32_params, stash.all_fp32_grad_stash)) + +- for params, stashed_grads in split_types: +- post_backward_models_are_masters(scaler, params, stashed_grads) ++ for params, stashed_grads in split_types: ++ post_backward_models_are_masters(scaler, params, stashed_grads, ++ use_npu_fused_optimizer=self.is_npu_fused_optimizer) + + + ##################################################################################### +@@ -318,6 +508,420 @@ + stash.lazy_init_called = True + + ++@torch.no_grad() ++def combined_init_no_master_weights(self): ++ stash = self._amp_stash ++ if stash.already_combined: ++ return ++ ++ if (not self.accelerate) and (not self.is_npu_fused_optimizer): ++ return ++ ++ all_fp16_params, all_fp16_grad_stash = [], [] ++ for param in stash.all_fp16_params: ++ if param.grad is not None: ++ all_fp16_params.append(param) ++ all_fp16_grad_stash.append(torch.zeros_like(param.grad)) ++ ++ stash.all_fp16_params = all_fp16_params ++ stash.all_fp16_grad_stash = all_fp16_grad_stash ++ ++ all_fp32_params, all_fp32_grad_stash = [], [] ++ for param in stash.all_fp32_params: ++ if param.grad is not None: ++ all_fp32_params.append(param) ++ all_fp32_grad_stash.append(torch.zeros_like(param.grad)) ++ ++ stash.all_fp32_params = all_fp32_params ++ stash.all_fp32_grad_stash = all_fp32_grad_stash ++ ++ if len(stash.all_fp16_grad_stash) > 0: ++ # if len == 0, avoid to create a useless combined tensor ++ stash.all_fp16_grad_stash_combine = combine_npu(stash.all_fp16_grad_stash, require_copy_value=False) ++ if len(stash.all_fp32_grad_stash) > 0: ++ stash.all_fp32_grad_stash_combine = combine_npu(stash.all_fp32_grad_stash, require_copy_value=False) ++ ++ stash.main_fp16_grad_combine, stash.fp16_grad_list = get_grad_combined_tensor_from_param(stash.all_fp16_params) ++ stash.main_fp32_grad_combine, stash.fp32_grad_list = get_grad_combined_tensor_from_param(stash.all_fp32_params) ++ # please do not change the order of tensor in this list. 
++ stash.grads_list = [stash.main_fp16_grad_combine, stash.main_fp32_grad_combine] ++ ++ if self.is_npu_fused_optimizer: ++ # stash.main_fp16_param_combine = combine_npu(stash.all_fp16_params) ++ stash.main_fp32_param_combine = combine_npu(stash.all_fp32_params) ++ ++ stash.already_combined = True ++ ++ ++def reset_all_combine_flags(self): ++ stash = self._amp_stash ++ stash.already_combined = False ++ stash.params_grads_are_combined_by_group = False ++ stash.param_states_are_combined_by_group = False ++ ++ ++def check_already_combined_params_and_grads_no_master_weights(self): ++ stash = self._amp_stash ++ if not self.check_combined_tensors or not stash.already_combined: ++ return ++ ++ if not is_combined_tensor_valid(stash.main_fp16_grad_combine, stash.fp16_grad_list) or \ ++ not is_combined_tensor_valid(stash.main_fp32_grad_combine, stash.fp32_grad_list): ++ maybe_print("Combined grad has been destroyed and will be recombined afterwards, please check if " ++ "there is any operation that may change the data_ptr/size/format of the grads.") ++ self._reset_all_combine_flags() ++ return ++ ++ if self.is_npu_fused_optimizer: ++ if not is_combined_tensor_valid(stash.main_fp32_param_combine, stash.all_fp32_params): ++ maybe_print("Combined param has been destroyed and will be recombined afterwards, please check if " ++ "there is any operation that may change the data_ptr/size/format of the params.") ++ self._reset_all_combine_flags() ++ return ++ ++ ++def check_already_combined_params_and_grads_with_master_weights(self): ++ stash = self._amp_stash ++ if not self.check_combined_tensors or not stash.already_combined: ++ return ++ ++ if not is_combined_tensor_valid(stash.main_fp16_grad_combine, stash.fp16_grad_list) or \ ++ not is_combined_tensor_valid(stash.main_fp32_from_fp32_grad_combine, stash.fp32_from_fp32_grad_list): ++ maybe_print("Combined grad has been destroyed and will be recombined afterwards, please check if " ++ "there is any operation that may change the data_ptr/size/format of the grads.") ++ self._reset_all_combine_flags() ++ return ++ ++ if self.is_npu_fused_optimizer: ++ if not is_combined_tensor_valid(stash.main_fp32_from_fp32_param_combine, stash.all_fp32_from_fp32_params): ++ maybe_print("Combined param has been destroyed and will be recombined afterwards, please check if " ++ "there is any operation that may change the data_ptr/size/format of the params.") ++ self._reset_all_combine_flags() ++ return ++ ++ ++def is_grad_in_combined_tensor(grad, combined_tensor): ++ if combined_tensor is None: ++ return False ++ ++ combined_tensor_data_start_addr = combined_tensor.data_ptr() ++ combined_tensor_data_end_addr = combined_tensor.data_ptr() + \ ++ combined_tensor.numel() * combined_tensor.element_size() ++ ++ if combined_tensor_data_start_addr <= grad.data_ptr() < combined_tensor_data_end_addr: ++ return True ++ else: ++ return False ++ ++ ++def combine_params_and_grads_by_group_no_master_weights(self): ++ stash = self._amp_stash ++ if stash.params_grads_are_combined_by_group: ++ return ++ ++ self._amp_combined_init() ++ stash.combined_params_indexed_by_group = [] ++ stash.combined_grads_indexed_by_group = [] ++ stash.params_lists_indexed_by_group = [] ++ ++ combined_fp32_param = stash.main_fp32_param_combine ++ combined_fp32_grad = stash.main_fp32_grad_combine ++ ++ combined_group_fp32_param_index = 0 ++ combined_group_fp32_grad_index = 0 ++ ++ group_num = 0 ++ for group in self.param_groups: ++ group_num += 1 ++ ++ group_fp32_params = [] ++ group_fp32_param_size = 0 ++ 
group_fp32_grad_size = 0 ++ ++ for p in group['params']: ++ if p.grad is None: ++ continue ++ ++ param_size = p.storage().size() ++ group_fp32_param_size += param_size ++ group_fp32_params.append(p) ++ ++ grad_size = p.grad.storage().size() ++ group_fp32_grad_size += grad_size ++ ++ combined_group_fp32_param = None ++ combined_group_fp32_grad = None ++ combined_group_fp32_param = get_part_combined_tensor(combined_fp32_param, ++ combined_group_fp32_param_index, ++ group_fp32_param_size) ++ combined_group_fp32_grad = get_part_combined_tensor(combined_fp32_grad, ++ combined_group_fp32_grad_index, ++ group_fp32_grad_size) ++ combined_group_fp32_param_index += group_fp32_param_size ++ combined_group_fp32_grad_index += group_fp32_grad_size ++ ++ combined_params = [] ++ combined_grads = [] ++ params_list = [] ++ ++ combined_params.append(combined_group_fp32_param) ++ combined_grads.append(combined_group_fp32_grad) ++ params_list.append(group_fp32_params) ++ ++ stash.combined_params_indexed_by_group.append(combined_params) ++ stash.combined_grads_indexed_by_group.append(combined_grads) ++ stash.params_lists_indexed_by_group.append(params_list) ++ ++ maybe_print("group num: {}".format(group_num)) ++ stash.params_grads_are_combined_by_group = True ++ ++ ++def combine_params_and_grads_by_group_with_master_weights(self): ++ stash = self._amp_stash ++ if stash.params_grads_are_combined_by_group: ++ return ++ ++ self._amp_combined_init() ++ stash.combined_params_indexed_by_group = [] ++ stash.combined_grads_indexed_by_group = [] ++ stash.params_lists_indexed_by_group = [] ++ ++ combined_fp32_from_fp32_param = stash.main_fp32_from_fp32_param_combine ++ combined_fp32_from_fp16_param = stash.main_fp32_from_fp16_param_combine ++ combined_fp32_from_fp32_grad = stash.main_fp32_from_fp32_grad_combine ++ combined_fp32_from_fp16_grad = stash.main_fp32_from_fp16_grad_combine ++ ++ combined_group_fp32_from_fp32_param_index, combined_group_fp32_from_fp16_param_index = 0, 0 ++ combined_group_fp32_from_fp32_grad_index, combined_group_fp32_from_fp16_grad_index = 0, 0 ++ ++ group_num = 0 ++ for group in self.param_groups: ++ group_num += 1 ++ ++ group_fp32_from_fp32_params = [] ++ group_fp32_from_fp16_params = [] ++ group_fp32_from_fp32_param_size, group_fp32_from_fp16_param_size = 0, 0 ++ group_fp32_from_fp32_grad_size, group_fp32_from_fp16_grad_size = 0, 0 ++ ++ for p in group['params']: ++ if p.grad is None: ++ continue ++ ++ param_size = p.storage().size() ++ grad_size = p.grad.storage().size() ++ if is_grad_in_combined_tensor(p.grad, combined_fp32_from_fp32_grad): ++ group_fp32_from_fp32_param_size += param_size ++ group_fp32_from_fp32_params.append(p) ++ group_fp32_from_fp32_grad_size += grad_size ++ else: ++ group_fp32_from_fp16_param_size += param_size ++ group_fp32_from_fp16_params.append(p) ++ group_fp32_from_fp16_grad_size += grad_size ++ ++ combined_group_fp32_from_fp32_param = None ++ combined_group_fp32_from_fp16_param = None ++ combined_group_fp32_from_fp32_grad = None ++ combined_group_fp32_from_fp16_grad = None ++ ++ combined_group_fp32_from_fp32_param = get_part_combined_tensor(combined_fp32_from_fp32_param, ++ combined_group_fp32_from_fp32_param_index, ++ group_fp32_from_fp32_param_size) ++ combined_group_fp32_from_fp16_param = get_part_combined_tensor(combined_fp32_from_fp16_param, ++ combined_group_fp32_from_fp16_param_index, ++ group_fp32_from_fp16_param_size) ++ combined_group_fp32_from_fp32_grad = get_part_combined_tensor(combined_fp32_from_fp32_grad, ++ combined_group_fp32_from_fp32_grad_index, 
++ group_fp32_from_fp32_grad_size) ++ combined_group_fp32_from_fp16_grad = get_part_combined_tensor(combined_fp32_from_fp16_grad, ++ combined_group_fp32_from_fp16_grad_index, ++ group_fp32_from_fp16_grad_size) ++ ++ combined_group_fp32_from_fp32_param_index += group_fp32_from_fp32_param_size ++ combined_group_fp32_from_fp16_param_index += group_fp32_from_fp16_param_size ++ combined_group_fp32_from_fp32_grad_index += group_fp32_from_fp32_grad_size ++ combined_group_fp32_from_fp16_grad_index += group_fp32_from_fp16_grad_size ++ ++ combined_params = [] ++ combined_grads = [] ++ params_list = [] ++ ++ combined_params.append(combined_group_fp32_from_fp32_param) ++ combined_params.append(combined_group_fp32_from_fp16_param) ++ combined_grads.append(combined_group_fp32_from_fp32_grad) ++ combined_grads.append(combined_group_fp32_from_fp16_grad) ++ params_list.append(group_fp32_from_fp32_params) ++ params_list.append(group_fp32_from_fp16_params) ++ ++ stash.combined_params_indexed_by_group.append(combined_params) ++ stash.combined_grads_indexed_by_group.append(combined_grads) ++ stash.params_lists_indexed_by_group.append(params_list) ++ ++ maybe_print("group num: {}".format(group_num)) ++ stash.params_grads_are_combined_by_group = True ++ ++ ++def new_zero_grad_with_master_weights(self): ++ stash = self._amp_stash ++ self._amp_lazy_init() ++ # Zero the model grads. ++ for param in stash.all_fp16_params: ++ if param.grad is not None: ++ param.grad.detach_() ++ param.grad.zero_() ++ for param in stash.all_fp32_from_fp32_params: ++ if param.grad is not None: ++ param.grad.detach_() ++ param.grad.zero_() ++ # Clear the master grads that are independent of model grads ++ for param in stash.all_fp32_from_fp16_params: ++ param.grad = None ++ ++ ++def new_zero_grad_accelerate_with_master_weights(self): ++ stash = self._amp_stash ++ self._amp_lazy_init() ++ self._check_already_combined_params_and_grads() ++ # Zero the model grads. 
++ stash.process_zero_grad = True ++ ++ if not stash.already_combined: ++ for param in stash.all_fp16_params: ++ if param.grad is not None: ++ param.grad.detach_() ++ param.grad.zero_() ++ for param in stash.all_fp32_from_fp32_params: ++ if param.grad is not None: ++ param.grad.detach_() ++ param.grad.zero_() ++ for param in stash.all_fp32_from_fp16_params: ++ if param.grad is not None: ++ param.grad.zero_() ++ return ++ ++ if stash.main_fp16_grad_combine is not None: ++ stash.main_fp16_grad_combine.zero_() ++ if stash.main_fp32_from_fp32_grad_combine is not None: ++ stash.main_fp32_from_fp32_grad_combine.zero_() ++ # Clear the master grads that are independent of model grads ++ if stash.main_fp32_from_fp16_grad_combine is not None: ++ stash.main_fp32_from_fp16_grad_combine.zero_() ++ ++ ++def can_get_combined_tensors(self, name): ++ if name == 'params': ++ if not self.is_npu_fused_optimizer: ++ maybe_print("To get combined params, please use npu fused optimizer.") ++ return False ++ elif name == 'grads' or name == 'grad_masks': ++ if (not self.accelerate) and (not self.is_npu_fused_optimizer): ++ maybe_print("To get combined {}, please set combine_grad=True or use npu fused optimizer.".format(name)) ++ return False ++ else: ++ maybe_print("{} are not supported to be combined.".format(name)) ++ return False ++ ++ stash = self._amp_stash ++ if not stash.already_combined: ++ maybe_print("Please get the combined {} after backward phase.".format(name)) ++ return False ++ return True ++ ++ ++def get_model_combined_params(self): ++ stash = self._amp_stash ++ combined_params = [] ++ ++ if not self._can_get_combined_tensors('params'): ++ return combined_params ++ ++ self._check_already_combined_params_and_grads() ++ self._amp_combined_init() ++ ++ if stash.master_weights: ++ combined_params.append(stash.main_fp16_param_combine) ++ combined_params.append(stash.main_fp32_from_fp32_param_combine) ++ else: ++ combined_params.append(stash.main_fp32_param_combine) ++ return combined_params ++ ++ ++def get_model_combined_grads(self): ++ stash = self._amp_stash ++ combined_grads = [] ++ ++ if not self._can_get_combined_tensors('grads'): ++ return combined_grads ++ ++ self._check_already_combined_params_and_grads() ++ self._amp_combined_init() ++ ++ if stash.master_weights: ++ combined_grads.append(stash.main_fp16_grad_combine) ++ combined_grads.append(stash.main_fp32_from_fp32_grad_combine) ++ else: ++ combined_grads.append(stash.main_fp32_grad_combine) ++ return combined_grads ++ ++ ++def get_optimizer_combined_params(self): ++ stash = self._amp_stash ++ combined_params = [] ++ ++ if not self._can_get_combined_tensors('params'): ++ return combined_params ++ ++ self._check_already_combined_params_and_grads() ++ self._amp_combined_init() ++ ++ if stash.master_weights: ++ combined_params.append(stash.main_fp32_from_fp16_param_combine) ++ combined_params.append(stash.main_fp32_from_fp32_param_combine) ++ else: ++ combined_params.append(stash.main_fp32_param_combine) ++ return combined_params ++ ++ ++def get_optimizer_combined_grads(self): ++ stash = self._amp_stash ++ combined_grads = [] ++ ++ if not self._can_get_combined_tensors('grads'): ++ return combined_grads ++ ++ self._check_already_combined_params_and_grads() ++ self._amp_combined_init() ++ ++ if stash.master_weights: ++ combined_grads.append(stash.main_fp32_from_fp16_grad_combine) ++ combined_grads.append(stash.main_fp32_from_fp32_grad_combine) ++ else: ++ combined_grads.append(stash.main_fp32_grad_combine) ++ return combined_grads ++ ++ ++def 
get_optimizer_combined_grad_masks(self): ++ stash = self._amp_stash ++ combined_grad_masks = [] ++ ++ if not self._can_get_combined_tensors('grad_masks'): ++ return combined_grad_masks ++ ++ if stash.master_weights: ++ if stash.main_fp32_from_fp16_grad_combine_mask is None: ++ stash.main_fp32_from_fp16_grad_combine_mask = \ ++ get_grad_combined_tensor_mask_from_param(stash.all_fp32_from_fp16_params) ++ stash.main_fp32_from_fp32_grad_combine_mask = \ ++ get_grad_combined_tensor_mask_from_param(stash.all_fp32_from_fp32_params) ++ combined_grad_masks.append(stash.main_fp32_from_fp16_grad_combine_mask) ++ combined_grad_masks.append(stash.main_fp32_from_fp32_grad_combine_mask) ++ else: ++ if stash.main_fp32_grad_combine_mask is None: ++ stash.main_fp32_grad_combine_mask = \ ++ get_grad_combined_tensor_mask_from_param(stash.all_fp32_params) ++ combined_grad_masks.append(stash.main_fp32_grad_combine_mask) ++ return combined_grad_masks ++ ++ + def _process_optimizer(optimizer, properties): + if hasattr(optimizer, "_amp_stash"): + raise RuntimeError("A given optimizer should only be passed through amp.initialize once.") +@@ -327,15 +931,64 @@ + optimizer._amp_stash.lazy_init_called = False + optimizer._amp_stash.already_patched = False + optimizer._amp_stash.params_have_scaled_gradients = False ++ optimizer.accelerate = properties.combine_grad ++ optimizer.check_combined_tensors = properties.check_combined_tensors ++ optimizer._amp_stash.master_weights = properties.master_weights ++ optimizer._amp_stash.grads_list = [] ++ optimizer._amp_stash.already_combined = False ++ ++ optimizer._amp_stash.process_zero_grad = True ++ ++ optimizer._amp_stash.params_grads_are_combined_by_group = False ++ optimizer._amp_stash.combined_params_indexed_by_group = [] ++ optimizer._amp_stash.combined_grads_indexed_by_group = [] ++ optimizer._amp_stash.params_lists_indexed_by_group = [] ++ optimizer._amp_stash.param_states_are_combined_by_group = False ++ optimizer._amp_stash.combined_param_states_indexed_by_group = [] + + for name in ("_lazy_init_maybe_master_weights", + "_master_params_to_model_params", + "_prepare_amp_backward", + "_post_amp_backward", +- "_amp_lazy_init"): ++ "_amp_lazy_init", ++ "_amp_combined_init", ++ "_reset_all_combine_flags", ++ "_check_already_combined_params_and_grads", ++ "_combine_params_and_grads_by_group", ++ "_can_get_combined_tensors", ++ "get_model_combined_params", ++ "get_model_combined_grads", ++ "get_optimizer_combined_params", ++ "get_optimizer_combined_grads"): + if hasattr(optimizer, name): + raise RuntimeError("Incoming optimizer already has {} defined.".format(name)) + ++ if properties.opt_level == "O2" and properties.combine_grad and properties.master_weights != True: ++ raise RuntimeError("With opt_level O2, master_weights should be True when combine_grad is True") ++ ++ if hasattr(optimizer, "is_npu_fused_optimizer") and optimizer.is_npu_fused_optimizer is True: ++ maybe_print("Use npu fused optimizer") ++ else: ++ optimizer.is_npu_fused_optimizer = False ++ ++ if optimizer.is_npu_fused_optimizer: ++ if properties.opt_level != "O1" and properties.opt_level != "O2": ++ raise RuntimeError("Currently, npu fused optimizer can only be used when opt_level='O1' or opt_level='O2'") ++ ++ if properties.opt_level == "O2" and properties.master_weights != True: ++ raise RuntimeError("With opt_level O2, master_weights should be True when npu fused optimizer is used") ++ ++ old_load_state_dict = optimizer.load_state_dict ++ def new_load_state_dict(self, state_dict): ++ 
old_load_state_dict(state_dict) ++ self._amp_stash.param_states_are_combined_by_group = False ++ optimizer.load_state_dict = types.MethodType(new_load_state_dict, optimizer) ++ ++ if not properties.combine_grad and not optimizer.is_npu_fused_optimizer and \ ++ properties.check_combined_tensors: ++ maybe_print("Because combine_grad != True and no npu fused optimizer is used, " ++ "checking combined tensors function will not take effect!") ++ + # TODO: Centralize exposure and import error checking for the C backend. + if multi_tensor_applier.available: + import amp_C +@@ -352,34 +1005,31 @@ + + old_step = optimizer.step + def new_step(self, closure=None): ++ stash = self._amp_stash + if closure is not None: + raise RuntimeError("Currently, Amp does not support closure use with optimizers.") + retval = old_step() + if not isinstance(self, FusedSGD): + self._master_params_to_model_params() + # Clear the master grads that wouldn't be zeroed by model.zero_grad() +- for param in self._amp_stash.all_fp32_from_fp16_params: +- param.grad = None ++ if optimizer.accelerate or optimizer.is_npu_fused_optimizer: ++ if stash.main_fp32_from_fp16_grad_combine is not None: ++ stash.main_fp32_from_fp16_grad_combine.zero_() ++ else: ++ for param in stash.all_fp32_from_fp16_params: ++ param.grad = None + return retval + optimizer.step = types.MethodType(new_step, optimizer) + + old_zero_grad = optimizer.zero_grad +- def new_zero_grad(self): +- stash = self._amp_stash +- self._amp_lazy_init() +- # Zero the model grads. +- for param in stash.all_fp16_params: +- if param.grad is not None: +- param.grad.detach_() +- param.grad.zero_() +- for param in stash.all_fp32_from_fp32_params: +- if param.grad is not None: +- param.grad.detach_() +- param.grad.zero_() +- # Clear the master grads that are independent of model grads +- for param in self._amp_stash.all_fp32_from_fp16_params: +- param.grad = None +- optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer) ++ if optimizer.accelerate or optimizer.is_npu_fused_optimizer: ++ optimizer.zero_grad = types.MethodType(new_zero_grad_accelerate_with_master_weights, optimizer) ++ else: ++ optimizer.zero_grad = types.MethodType(new_zero_grad_with_master_weights, optimizer) ++ ++ if optimizer.is_npu_fused_optimizer: ++ optimizer._combine_params_and_grads_by_group = types.MethodType( ++ combine_params_and_grads_by_group_with_master_weights, optimizer) + + if isinstance(optimizer, FusedSGD): + optimizer._prepare_amp_backward = types.MethodType( +@@ -391,10 +1041,35 @@ + prepare_backward_with_master_weights, optimizer) + optimizer._post_amp_backward = types.MethodType( + post_backward_with_master_weights, optimizer) ++ ++ optimizer._amp_combined_init = types.MethodType(combined_init_with_master_weights, optimizer) ++ optimizer._check_already_combined_params_and_grads = types.MethodType( ++ check_already_combined_params_and_grads_with_master_weights, optimizer) + else: + optimizer._lazy_init_maybe_master_weights = types.MethodType( + lazy_init_no_master_weights, optimizer) + ++ old_zero_grad = optimizer.zero_grad ++ if optimizer.accelerate or optimizer.is_npu_fused_optimizer: ++ def new_zero_grad_accelerate_no_master_weights(self): ++ stash = self._amp_stash ++ self._amp_lazy_init() ++ self._check_already_combined_params_and_grads() ++ # Zero the model grads. 
++ if not stash.already_combined: ++ old_zero_grad() ++ return ++ ++ if stash.main_fp16_grad_combine is not None: ++ stash.main_fp16_grad_combine.zero_() ++ if stash.main_fp32_grad_combine is not None: ++ stash.main_fp32_grad_combine.zero_() ++ optimizer.zero_grad = types.MethodType(new_zero_grad_accelerate_no_master_weights, optimizer) ++ ++ if optimizer.is_npu_fused_optimizer: ++ optimizer._combine_params_and_grads_by_group = types.MethodType( ++ combine_params_and_grads_by_group_no_master_weights, optimizer) ++ + if isinstance(optimizer, FusedSGD): + optimizer._prepare_amp_backward = types.MethodType( + prepare_backward_no_master_weights_FusedSGD, optimizer) +@@ -406,7 +1081,18 @@ + optimizer._post_amp_backward = types.MethodType( + post_backward_no_master_weights, optimizer) + ++ optimizer._amp_combined_init = types.MethodType(combined_init_no_master_weights, optimizer) ++ optimizer._check_already_combined_params_and_grads = types.MethodType( ++ check_already_combined_params_and_grads_no_master_weights, optimizer) ++ + optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer) ++ optimizer._reset_all_combine_flags = types.MethodType(reset_all_combine_flags, optimizer) ++ optimizer._can_get_combined_tensors = types.MethodType(can_get_combined_tensors, optimizer) ++ optimizer.get_model_combined_params = types.MethodType(get_model_combined_params, optimizer) ++ optimizer.get_model_combined_grads = types.MethodType(get_model_combined_grads, optimizer) ++ optimizer.get_optimizer_combined_params = types.MethodType(get_optimizer_combined_params, optimizer) ++ optimizer.get_optimizer_combined_grads = types.MethodType(get_optimizer_combined_grads, optimizer) ++ optimizer.get_optimizer_combined_grad_masks = types.MethodType(get_optimizer_combined_grad_masks, optimizer) + + old_add_param_group = optimizer.add_param_group + +@@ -435,13 +1121,13 @@ + fp32_from_fp16_params_this_group = [] + for i, param in enumerate(new_group['params']): + if param.requires_grad: +- if param.type() == 'torch.cuda.HalfTensor': ++ if param.type() == 'torch.npu.HalfTensor': + fp16_params_this_group.append(param) + master_param = param.detach().clone().float() + master_param.requires_grad = True + new_group['params'][i] = master_param + fp32_from_fp16_params_this_group.append(master_param) +- elif param.type() == 'torch.cuda.FloatTensor': ++ elif param.type() == 'torch.npu.FloatTensor': + fp32_params_this_group.append(param) + new_group['params'][i] = param + else: +@@ -471,10 +1157,10 @@ + # param.grad = None + else: + for param in new_group['params']: +- if param.type() == 'torch.cuda.HalfTensor': ++ if param.type() == 'torch.npu.HalfTensor': + stash.all_fp16_params.append(param) + stash.all_fp16_grad_stash.append(None) +- elif param.type() == 'torch.cuda.FloatTensor': ++ elif param.type() == 'torch.npu.FloatTensor': + stash.all_fp32_params.append(param) + stash.all_fp32_grad_stash.append(None) + else: +diff -Nur '--exclude=.git' apex/apex/amp/frontend.py apex-npu/apex/amp/frontend.py +--- apex/apex/amp/frontend.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/amp/frontend.py 2021-06-17 07:10:45.373711948 +0000 +@@ -19,6 +19,8 @@ + "keep_batchnorm_fp32" : None, + "master_weights" : None, + "loss_scale" : 1.0, ++ "combine_grad": None, ++ "check_combined_tensors": None + # Reserved for future functionality + # "fused_optimizer" : False, + # "enable_ddp_interop" : False, +@@ -91,6 +93,11 @@ + self.options[name] = value + else: + self.options[name] = float(value) ++ elif name == "combine_grad" or name 
== "check_combined_tensors": ++ if self.opt_level not in ["O1", "O2"] and value: ++ warn_or_err("Currently, combine_grad=True or check_combined_tensors=True should only be set " ++ "by selecting opt_level='O1' or opt_level='O2'.") ++ self.options[name] = value + else: + self.options[name] = value + else: +@@ -161,6 +168,7 @@ + properties.keep_batchnorm_fp32 = None + properties.master_weights = None + properties.loss_scale = "dynamic" ++ properties.combine_grad = None + # properties.fused_optimizer = False + # properties.enable_ddp_interop = False + return properties # modified in place so this isn't really necessary +@@ -206,7 +214,9 @@ + num_losses=1, + verbosity=1, + min_loss_scale=None, +- max_loss_scale=2.**24 ++ max_loss_scale=2.**24, ++ combine_grad=None, ++ check_combined_tensors=None + ): + """ + Initialize your models, optimizers, and the Torch tensor and functional namespace according to the +@@ -259,6 +269,9 @@ + If dynamic loss scaling is not used, `min_loss_scale` is ignored. + max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by + dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored. ++ combine_grad (bool, optional, default=None): If True, make gradients fused for unscale. ++ check_combined_tensors (bool, optional, default=None): If True, check if the combined grads and combined params ++ are valid during training + + Returns: + Model(s) and optimizer(s) modified according to the ``opt_level``. +@@ -306,6 +319,7 @@ + https://github.com/NVIDIA/apex/issues + """ + _amp_state.opt_properties = Properties() ++ # Here add a switch to open combine tensor + _amp_state.verbosity = verbosity + + if not enabled: +@@ -350,6 +364,10 @@ + _amp_state.opt_properties.master_weights = master_weights + if loss_scale is not None: + _amp_state.opt_properties.loss_scale = loss_scale ++ if combine_grad is not None: ++ _amp_state.opt_properties.combine_grad = combine_grad ++ if check_combined_tensors is not None: ++ _amp_state.opt_properties.check_combined_tensors = check_combined_tensors + + maybe_print("After processing overrides, optimization options are:", True) + for k, v in _amp_state.opt_properties.options.items(): +diff -Nur '--exclude=.git' apex/apex/amp/handle.py apex-npu/apex/amp/handle.py +--- apex/apex/amp/handle.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/amp/handle.py 2021-06-17 07:10:45.373711948 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import contextlib + import warnings + import sys +@@ -110,6 +126,9 @@ + if not optimizer._amp_stash.params_have_scaled_gradients: + optimizer._prepare_amp_backward() + ++ if loss_scaler.dynamic: ++ LossScaler.clear_npu_overflow_flag() ++ + yield (loss.float())*loss_scale + + if delay_unscale: +@@ -142,8 +161,12 @@ + # Maybe skip should delegate to a method owned by the optimizers themselves. 
+ if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"): + # Clear the master grads that wouldn't be zeroed by model.zero_grad() +- for param in opt._amp_stash.all_fp32_from_fp16_params: +- param.grad = None ++ if opt.accelerate or opt.is_npu_fused_optimizer: ++ if opt._amp_stash.main_fp32_from_fp16_grad_combine is not None: ++ opt._amp_stash.main_fp32_from_fp16_grad_combine.zero_() ++ else: ++ for param in opt._amp_stash.all_fp32_from_fp16_params: ++ param.grad = None + if hasattr(opt, "most_recent_scale"): + opt.most_recent_scale = 1.0 + opt.scale_set_by_backward = False +diff -Nur '--exclude=.git' apex/apex/amp/scaler.py apex-npu/apex/amp/scaler.py +--- apex/apex/amp/scaler.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/amp/scaler.py 2021-06-17 07:10:45.373711948 +0000 +@@ -1,7 +1,25 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import torch ++import torch.distributed as dist + from ..multi_tensor_apply import multi_tensor_applier + from ._amp_state import _amp_state, master_params, maybe_print + from itertools import product ++import importlib + + def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False): + # Exception handling for 18.04 compatibility +@@ -16,7 +34,8 @@ + master_grad.mul_(scale) + return False + +-def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False): ++def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, use_npu_fused_optimizer, ++ check_overflow=False): + # Exception handling for 18.04 compatibility + if check_overflow: + cpu_sum = float(model_grad.float().sum()) +@@ -27,13 +46,17 @@ + # master_grad.copy_(model_grad) + assert stashed_grad.dtype == master_grad.dtype + converted_model_grad = model_grad.data.to(master_grad.dtype) +- master_grad.data = a*converted_model_grad.data + b*stashed_grad.data ++ if use_npu_fused_optimizer: ++ master_grad.data[:] = a*converted_model_grad.data + b*stashed_grad.data ++ else: ++ master_grad.data = a*converted_model_grad.data + b*stashed_grad.data + return False + + class LossScaler(object): + warned_no_fused_kernel = False + warned_unscaling_non_fp32_grad = False + has_fused_kernel = False ++ npu_float_status = None + + def __init__(self, + loss_scale, +@@ -45,50 +68,89 @@ + if loss_scale == "dynamic": + self.dynamic = True + self._loss_scale = min(max_loss_scale, init_scale) ++ try: ++ LossScaler.npu_float_status = importlib.import_module("npu_float_status") ++ except ModuleNotFoundError as module_err: ++ maybe_print( ++ '\nImport module "npu_float_status" failed, ' ++ 'please install apex with --global-option="--npu_float_status" and then try again!\n') ++ raise(module_err) ++ except Exception as other_err: ++ raise(other_err) + else: + self.dynamic = False + self._loss_scale = loss_scale ++ LossScaler.npu_float_status = None + self._max_loss_scale = max_loss_scale + self._min_loss_scale = 
min_loss_scale + self._scale_seq_len = scale_window + self._unskipped = 0 + self._has_overflow = False +- self._overflow_buf = torch.cuda.IntTensor([0]) ++ self._overflow_buf = torch.npu.IntTensor([0]) ++ self._dist_overflow_count = torch.Tensor([0.]).to('npu') ++ self._dist_initialized = False ++ ++ try: ++ if dist.is_initialized(): ++ self._dist_initialized = True ++ except AttributeError as err: ++ maybe_print("torch.distributed has no attribute is_initialized") ++ + if multi_tensor_applier.available: + import amp_C + LossScaler.has_fused_kernel = multi_tensor_applier.available + LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale + LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby + else: +- if not LossScaler.warned_no_fused_kernel: +- maybe_print( +- "Warning: multi_tensor_applier fused unscale kernel is unavailable, " +- "possibly because apex was installed without --cuda_ext --cpp_ext. " +- "Using Python fallback. Original ImportError was: " + +- repr(multi_tensor_applier.import_err), +- True) + LossScaler.has_fused_kernel = False + LossScaler.warned_no_fused_kernel = True + ++ @staticmethod ++ def get_npu_overflow_flag(): ++ if LossScaler.npu_float_status is not None: ++ return LossScaler.npu_float_status.RunGetFloatStatusOp() ++ else: ++ return False ++ ++ @staticmethod ++ def clear_npu_overflow_flag(): ++ if LossScaler.npu_float_status is not None: ++ LossScaler.npu_float_status.RunClearFloatStatusOp() ++ + def loss_scale(self): + return self._loss_scale + + def unscale_python(self, model_grads, master_grads, scale): +- for model, master in zip(model_grads, master_grads): +- if model is not None: +- if not LossScaler.warned_unscaling_non_fp32_grad: +- if master.dtype != torch.float32: +- maybe_print( +- "Attempting to unscale a grad with type {} ".format(master.type()) + +- "Unscaling non-fp32 grads may indicate an error. " +- "When using Amp, you don't need to call .half() on your model.") +- LossScaler.warned_unscaling_non_fp32_grad = True +- self._has_overflow = scale_check_overflow_python(model, +- master, +- 1./scale, +- self.dynamic) +- if self._has_overflow and self.dynamic: +- break ++ if self.dynamic: ++ self._has_overflow = LossScaler.get_npu_overflow_flag() ++ else: ++ self._has_overflow = False ++ ++ if not self._has_overflow: ++ for model, master in zip(model_grads, master_grads): ++ if model is not None: ++ if not LossScaler.warned_unscaling_non_fp32_grad: ++ if master.dtype != torch.float32: ++ maybe_print( ++ "Attempting to unscale a grad with type {} ".format(master.type()) + ++ "Unscaling non-fp32 grads may indicate an error. " ++ "When using Amp, you don't need to call .half() on your model.") ++ LossScaler.warned_unscaling_non_fp32_grad = True ++ self._has_overflow = scale_check_overflow_python(model, ++ master, ++ 1./scale) ++ ++ if self._has_overflow: ++ if self.dynamic and self._dist_initialized: ++ self._dist_overflow_count.add_(1) ++ dist.all_reduce(self._dist_overflow_count) ++ self._dist_overflow_count.zero_() ++ else: ++ if self.dynamic and self._dist_initialized: ++ dist.all_reduce(self._dist_overflow_count) ++ if self._dist_overflow_count.item() != 0: ++ self._has_overflow = True ++ self._dist_overflow_count.zero_() + + # unused_scale keeps some of the old API alive for hopefully a short time. 
+ def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None): +@@ -117,7 +179,7 @@ + 1./scale) + else: + self.unscale_python(model_grads, master_grads, scale) +- ++ + # Defer to update_scale + # If the fused kernel is available, we only need one D2H memcopy and sync. + # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow: +@@ -128,32 +190,50 @@ + stashed_master_grads, + master_grads, + a, +- b): +- for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads): +- if model is None and stashed is None: +- continue +- else: +- if not LossScaler.warned_unscaling_non_fp32_grad: +- if master.dtype != torch.float32: +- maybe_print( +- "Attempting to unscale a grad with type {} ".format(master.type()) + +- "Unscaling non-fp32 grads may indicate an error. " +- "When using Amp, you don't need to call .half() on your model.") +- LossScaler.warned_unscaling_non_fp32_grad = True +- self._has_overflow = axpby_check_overflow_python(model, +- stashed, +- master, +- a, +- b, +- self.dynamic) +- if self._has_overflow and self.dynamic: +- break ++ b, ++ use_npu_fused_optimizer): ++ if self.dynamic: ++ self._has_overflow = LossScaler.get_npu_overflow_flag() ++ else: ++ self._has_overflow = False ++ ++ if not self._has_overflow: ++ for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads): ++ if model is None and stashed is None: ++ continue ++ else: ++ if not LossScaler.warned_unscaling_non_fp32_grad: ++ if master.dtype != torch.float32: ++ maybe_print( ++ "Attempting to unscale a grad with type {} ".format(master.type()) + ++ "Unscaling non-fp32 grads may indicate an error. " ++ "When using Amp, you don't need to call .half() on your model.") ++ LossScaler.warned_unscaling_non_fp32_grad = True ++ self._has_overflow = axpby_check_overflow_python(model, ++ stashed, ++ master, ++ a, ++ b, ++ use_npu_fused_optimizer) ++ ++ if self._has_overflow: ++ if self.dynamic and self._dist_initialized: ++ self._dist_overflow_count.add_(1) ++ dist.all_reduce(self._dist_overflow_count) ++ self._dist_overflow_count.zero_() ++ else: ++ if self.dynamic and self._dist_initialized: ++ dist.all_reduce(self._dist_overflow_count) ++ if self._dist_overflow_count.item() != 0: ++ self._has_overflow = True ++ self._dist_overflow_count.zero_() + + def unscale_with_stashed(self, + model_grads, + stashed_master_grads, + master_grads, +- scale_override=None): ++ scale_override=None, ++ use_npu_fused_optimizer=False): + if self._has_overflow: + return + +@@ -181,13 +261,86 @@ + stashed_master_grads, + master_grads, + out_scale/grads_have_scale, +- out_scale/stashed_have_scale) ++ out_scale/stashed_have_scale, ++ use_npu_fused_optimizer) + + # Defer to update_scale + # If the fused kernel is available, we only need one D2H memcopy and sync. 
+ # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow: + # self._has_overflow = self._overflow_buf.item() + ++ def unscale_with_stashed_combined(self, ++ grads_combined, ++ stashed_grads_combined, ++ scale_override=None): ++ ++ if self.dynamic: ++ self._has_overflow = LossScaler.get_npu_overflow_flag() ++ if self._dist_initialized: ++ if self._has_overflow: ++ self._dist_overflow_count.add_(1) ++ dist.all_reduce(self._dist_overflow_count) ++ self._dist_overflow_count.zero_() ++ else: ++ dist.all_reduce(self._dist_overflow_count) ++ if self._dist_overflow_count.item() != 0: ++ self._has_overflow = True ++ self._dist_overflow_count.zero_() ++ ++ if self._has_overflow: ++ return ++ ++ grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0 ++ if scale_override is not None: ++ grads_have_scale, stashed_have_scale, out_scale = scale_override ++ ++ grads_combined.data[:] = grads_combined.mul_(out_scale/grads_have_scale) + stashed_grads_combined ++ ++ def unscale_grad_O2(self, ++ model_grads_combined=None, ++ stashed_master_grads_combined=None, ++ master_grads_combined=None, ++ scale_override=None, ++ master_grads=None, ++ model_grads=None): ++ ++ if self.dynamic: ++ self._has_overflow = LossScaler.get_npu_overflow_flag() ++ if self._dist_initialized: ++ if self._has_overflow: ++ self._dist_overflow_count.add_(1) ++ dist.all_reduce(self._dist_overflow_count) ++ self._dist_overflow_count.zero_() ++ else: ++ dist.all_reduce(self._dist_overflow_count) ++ if self._dist_overflow_count.item() != 0: ++ self._has_overflow = True ++ self._dist_overflow_count.zero_() ++ ++ if self._has_overflow: ++ return ++ ++ grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0 ++ if scale_override is not None: ++ grads_have_scale, stashed_have_scale, out_scale = scale_override ++ ++ if stashed_master_grads_combined is not None and \ ++ master_grads_combined.data_ptr() == stashed_master_grads_combined.data_ptr() and \ ++ master_grads_combined.numel() == stashed_master_grads_combined.numel(): ++ stashed_master_grads_combined = master_grads_combined.clone() ++ ++ if master_grads_combined is not model_grads_combined: ++ if master_grads_combined.numel() == model_grads_combined.numel(): ++ master_grads_combined.copy_(model_grads_combined) ++ else: ++ for master, model in zip(master_grads, model_grads): ++ master.copy_(model) ++ master_grads_combined.mul_(out_scale/grads_have_scale) ++ ++ if stashed_master_grads_combined is not None: ++ assert stashed_master_grads_combined.dtype == master_grads_combined.dtype ++ master_grads_combined.add_(stashed_master_grads_combined) ++ + def clear_overflow_state(self): + self._has_overflow = False + if self.has_fused_kernel: +diff -Nur '--exclude=.git' apex/apex/amp/utils.py apex-npu/apex/amp/utils.py +--- apex/apex/amp/utils.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/amp/utils.py 2021-06-17 07:10:45.377711979 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + from . import compat + + import functools +@@ -55,7 +71,7 @@ + if is_nested(x): + return type(x)([maybe_half(y) for y in x]) + +- if not x.is_cuda or type_string(x) == 'HalfTensor': ++ if not 'npu' in x.type() or type_string(x) == 'HalfTensor': + return x + else: + if verbose: +@@ -66,7 +82,7 @@ + if is_nested(x): + return type(x)([maybe_float(y) for y in x]) + +- if not x.is_cuda or type_string(x) == 'FloatTensor': ++ if not 'npu' in x.type() or type_string(x) == 'FloatTensor': + return x + else: + if verbose: +@@ -94,7 +110,7 @@ + cached_x = cache[x] + if x.requires_grad and cached_x.requires_grad: + # Make sure x is actually cached_x's autograd parent. +- if cached_x.grad_fn.next_functions[1][0].variable is not x: ++ if cached_x.grad_fn.next_functions[0][0].variable is not x: + raise RuntimeError("x and cache[x] both require grad, but x is not " + "cache[x]'s parent. This is likely an error.") + # During eval, it's possible to end up caching casted weights with +diff -Nur '--exclude=.git' apex/apex/contrib/sparsity/README.md apex-npu/apex/contrib/sparsity/README.md +--- apex/apex/contrib/sparsity/README.md 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/contrib/sparsity/README.md 2021-06-17 07:10:45.389712070 +0000 +@@ -1,37 +1,37 @@ +-# Introduction to ASP +- +-This page documents the API for ASP (Automatic Sparsity), a tool that enables sparse training and inference for PyTorch models by adding 2 lines of Python. +- +-## Importing ASP +-``` +-from apex.contrib.sparsity import ASP +-``` +- +-## Initializing ASP +- +-Apart from the import statement, it is sufficient to add just the following line of code before the training phase to augment the model and the optimizer for sparse training/infercence: +-``` +-ASP.prune_trained_model(model, optimizer) +-``` +- +-In a typical PyTorch training loop, it might look like this: +- +-``` +-ASP.prune_trained_model(model, optimizer) +- +-x, y = DataLoader(args) +-for epoch in range(epochs): +- y_pred = model(x) +- loss = loss_function(y_pred, y) +- loss.backward() +- optimizer.step() +- +-torch.save(...) +-``` +-The `prune_trained_model` calculates the sparse mask and applies it to the weights. This is done once, i.e., sparse locations in the weights matrix remain fixed after this step. In order to recompute the sparse mask in between training, say after an epoch, use the following method: +- +-``` +-ASP.compute_sparse_masks() +-``` +- ++# Introduction to ASP ++ ++This page documents the API for ASP (Automatic Sparsity), a tool that enables sparse training and inference for PyTorch models by adding 2 lines of Python. ++ ++## Importing ASP ++``` ++from apex.contrib.sparsity import ASP ++``` ++ ++## Initializing ASP ++ ++Apart from the import statement, it is sufficient to add just the following line of code before the training phase to augment the model and the optimizer for sparse training/infercence: ++``` ++ASP.prune_trained_model(model, optimizer) ++``` ++ ++In a typical PyTorch training loop, it might look like this: ++ ++``` ++ASP.prune_trained_model(model, optimizer) ++ ++x, y = DataLoader(args) ++for epoch in range(epochs): ++ y_pred = model(x) ++ loss = loss_function(y_pred, y) ++ loss.backward() ++ optimizer.step() ++ ++torch.save(...) ++``` ++The `prune_trained_model` calculates the sparse mask and applies it to the weights. This is done once, i.e., sparse locations in the weights matrix remain fixed after this step. 
In order to recompute the sparse mask in between training, say after an epoch, use the following method: ++ ++``` ++ASP.compute_sparse_masks() ++``` ++ + A more thorough example can be found in `./test/toy_problem.py`. +\ No newline at end of file +diff -Nur '--exclude=.git' apex/apex/fp16_utils/fp16_optimizer.py apex-npu/apex/fp16_utils/fp16_optimizer.py +--- apex/apex/fp16_utils/fp16_optimizer.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/fp16_utils/fp16_optimizer.py 2021-06-17 07:10:45.389712070 +0000 +@@ -1,554 +1,554 @@ +-import torch +-from torch import nn +-from torch.autograd import Variable +-from torch.nn.parameter import Parameter +-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +- +-from ..amp._amp_state import _amp_state, maybe_print +-from ..amp.scaler import LossScaler +-from ..multi_tensor_apply import multi_tensor_applier +-from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm +- +-# TODO: Update overflow check + downscale to use Carl's fused kernel. +-class FP16_Optimizer(object): +- def __init__(self, +- init_optimizer, +- static_loss_scale=1.0, +- dynamic_loss_scale=False, +- dynamic_loss_args=None, +- verbose=True): +- print("Warning: FP16_Optimizer is deprecated and dangerous, and will be deleted soon. " +- "If it still works, you're probably getting lucky. " +- "For mixed precision, use the documented API https://nvidia.github.io/apex/amp.html, with opt_level=O1.") +- +- if not torch.cuda.is_available: +- raise SystemError("Cannot use fp16 without CUDA.") +- +- self.verbose = verbose +- +- self.optimizer = init_optimizer +- # init_state_dict sets up an alternative way to cast per-param state tensors. +- # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. +- # init_state_dict = init_optimizer.state_dict() +- +- self.fp16_groups = [] +- self.fp32_from_fp16_groups = [] +- self.fp32_from_fp32_groups = [] +- for i, param_group in enumerate(self.optimizer.param_groups): +- self.maybe_print("FP16_Optimizer processing param group {}:".format(i)) +- fp16_params_this_group = [] +- fp32_params_this_group = [] +- fp32_from_fp16_params_this_group = [] +- for i, param in enumerate(param_group['params']): +- if param.requires_grad: +- if param.type() == 'torch.cuda.HalfTensor': +- self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" +- .format(param.size())) +- fp16_params_this_group.append(param) +- master_param = param.detach().clone().float() +- master_param.requires_grad = True +- param_group['params'][i] = master_param +- fp32_from_fp16_params_this_group.append(master_param) +- # Reset existing state dict key to the new master param. +- # We still need to recast per-param state tensors, if any, to FP32. +- if param in self.optimizer.state: +- self.optimizer.state[master_param] = self.optimizer.state.pop(param) +- elif param.type() == 'torch.cuda.FloatTensor': +- self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" +- .format(param.size())) +- fp32_params_this_group.append(param) +- param_group['params'][i] = param +- else: +- raise TypeError("Wrapped parameters must be either " +- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. 
" +- "Received {}".format(param.type())) +- +- self.fp16_groups.append(fp16_params_this_group) +- self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) +- self.fp32_from_fp32_groups.append(fp32_params_this_group) +- +- self.all_fp16_params = [] +- for group in self.fp16_groups: +- self.all_fp16_params += group +- +- self.all_fp32_from_fp16_params = [] +- for group in self.fp32_from_fp16_groups: +- self.all_fp32_from_fp16_params += group +- +- self.all_fp32_from_fp32_params = [] +- for group in self.fp32_from_fp32_groups: +- self.all_fp32_from_fp32_params += group +- +- # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors +- self.optimizer.load_state_dict(self.optimizer.state_dict()) +- # alternative way to cast per-param state tensors: +- # self.optimizer.load_state_dict(init_state_dict) +- +- if dynamic_loss_scale: +- self.dynamic_loss_scale = True +- if dynamic_loss_args is not None: +- self.loss_scaler = LossScaler("dynamic", **dynamic_loss_args) +- else: +- self.loss_scaler = LossScaler("dynamic") +- else: +- self.dynamic_loss_scale = False +- self.loss_scaler = LossScaler(static_loss_scale) +- +- self.overflow = False +- self.first_closure_call_this_step = True +- +- self.clip_grad_norm = clip_grad_norm +- +- # TODO: Centralize exposure and import error checking for the C backend. +- if multi_tensor_applier.available: +- import amp_C +- self.multi_tensor_scale = amp_C.multi_tensor_scale +- self._dummy_overflow_buf = torch.cuda.IntTensor([0]); +- +- # Having self.maybe_print distinct from _amp_state.maybe_print is another artifact +- # of having to support FP16_Optimizer separately, for the time being. +- def maybe_print(self, msg): +- if self.verbose: +- print(msg) +- +- def __getstate__(self): +- raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") +- +- def __setstate__(self, state): +- raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") +- +- def zero_grad(self, set_grads_to_None=False): +- """ +- Zero fp32 and fp16 parameter grads. +- """ +- # In principle, only the .grad attributes of the model params need to be zeroed, +- # because gradients are copied into the FP32 master params. However, we zero +- # all gradients owned by the optimizer, just to be safe: +- for group in self.optimizer.param_groups: +- for p in group['params']: +- if set_grads_to_None: +- p.grad = None +- else: +- if p.grad is not None: +- p.grad.detach_() +- p.grad.zero_() +- +- # Zero fp16 gradients owned by the model: +- for fp16_group in self.fp16_groups: +- for param in fp16_group: +- if set_grads_to_None: +- param.grad = None +- else: +- if param.grad is not None: +- param.grad.detach_() # as in torch.optim.optimizer.zero_grad() +- param.grad.zero_() +- +- # Should not be used anymore. 
+- # def _check_overflow(self): +- # params = [] +- # for group in self.fp16_groups: +- # for param in group: +- # params.append(param) +- # for group in self.fp32_from_fp32_groups: +- # for param in group: +- # params.append(param) +- # self.overflow = self.loss_scaler.has_overflow(params) +- +- # def _update_scale(self, has_overflow=False): +- # self.loss_scaler.update_scale(has_overflow) +- +- def _master_params_to_model_params(self): +- if multi_tensor_applier.available: +- if len(self.all_fp16_params) > 0: +- multi_tensor_applier( +- self.multi_tensor_scale, +- self._dummy_overflow_buf, +- [self.all_fp32_from_fp16_params, self.all_fp16_params], +- 1.0) +- else: +- for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): +- master_params_to_model_params(fp16_group, fp32_from_fp16_group) +- +- # To consider: Integrate distributed with this wrapper by registering a hook on each variable +- # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. +- # def _model_grads_to_master_grads(self): +- # for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): +- # model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) +- +- # def _downscale_master(self): +- # if self.loss_scale != 1.0: +- # for group in self.optimizer.param_groups: +- # for param in group['params']: +- # if param.grad is not None: +- # param.grad.data.mul_(1./self.loss_scale) +- +- def clip_master_grads(self, max_norm, norm_type=2): +- """ +- Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. +- +- Args: +- max_norm (float or int): max norm of the gradients +- norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for +- infinity norm. +- +- Returns: +- Total norm of the current fp32 gradients (viewed as a single vector). +- +- .. warning:: +- Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). +- """ +- if not self.overflow: +- fp32_params = [] +- for param_group in self.optimizer.param_groups: +- for param in param_group['params']: +- fp32_params.append(param) +- return self.clip_grad_norm(fp32_params, max_norm, norm_type) +- else: +- return -1 +- +- def state_dict(self): +- """ +- Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. +- This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict +- of the contained Pytorch optimizer. +- Example:: +- +- checkpoint = {} +- checkpoint['model'] = model.state_dict() +- checkpoint['optimizer'] = optimizer.state_dict() +- torch.save(checkpoint, "saved.pth") +- """ +- state_dict = {} +- state_dict['loss_scaler'] = self.loss_scaler +- state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale +- state_dict['overflow'] = self.overflow +- state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step +- state_dict['optimizer_state_dict'] = self.optimizer.state_dict() +- state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups +- return state_dict +- +- def load_state_dict(self, state_dict): +- """ +- Loads a state_dict created by an earlier call to state_dict(). +- If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, +- whose parameters in turn came from ``model``, it is expected that the user +- will call ``model.load_state_dict()`` before +- ``fp16_optimizer_instance.load_state_dict()`` is called. 
+- +- Example:: +- +- model = torch.nn.Linear(D_in, D_out).cuda().half() +- optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) +- optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) +- ... +- checkpoint = torch.load("saved.pth") +- model.load_state_dict(checkpoint['model']) +- optimizer.load_state_dict(checkpoint['optimizer']) +- """ +- # I think it should actually be ok to reload the optimizer before the model. +- self.loss_scaler = state_dict['loss_scaler'] +- self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] +- self.overflow = state_dict['overflow'] +- self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] +- self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) +- # At this point, the optimizer's references to the model's fp32 parameters are up to date. +- # The optimizer's hyperparameters and internal buffers are also up to date. +- # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still +- # out of date. There are two options. +- # 1: Refresh the master params from the model's fp16 params. +- # This requires less storage but incurs precision loss. +- # 2: Save and restore the fp32 master copies separately. +- # We choose option 2. +- # +- # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device +- # of their associated parameters, because it's possible those buffers might not exist yet in +- # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been +- # constructed in the same way as the one whose state_dict we are loading, the same master params +- # are guaranteed to exist, so we can just copy_() from the saved master params. +- for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): +- for current, saved in zip(current_group, saved_group): +- current.data.copy_(saved.data) +- +- def step(self, closure=None): # could add clip option. +- """ +- If no closure is supplied, :attr:`step` should be called after +- ``fp16_optimizer_obj.backward(loss)``. +- :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to +- :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params +- originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run +- another forward pass using their model. +- +- If a closure is supplied, :attr:`step` may be called without a prior call to +- :attr:`backward(loss)`. +- This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. +- However, the user should take care that any ``loss.backward()`` call within the closure +- has been replaced by ``fp16_optimizer_obj.backward(loss)``. +- +- Args: +- closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. +- +- Example with closure:: +- +- # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an +- # existing pytorch optimizer. +- for input, target in dataset: +- def closure(): +- optimizer.zero_grad() +- output = model(input) +- loss = loss_fn(output, target) +- # loss.backward() becomes: +- optimizer.backward(loss) +- return loss +- optimizer.step(closure) +- +- .. 
warning:: +- Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. +- +- .. _`ordinary Pytorch optimizer use`: +- http://pytorch.org/docs/master/optim.html#optimizer-step-closure +- """ +- +- scale = self.loss_scaler.loss_scale() +- # To consider: Should this be in step(), or update_master_grads? It works either way, +- # but I should make it consistent with the Amp control flow, which updates the scale +- # during backward context manager exit. +- # self._update_scale(self.overflow) +- +- if self.overflow: +- # Using _amp_state.maybe_print instead of self.print here is intentional. +- maybe_print("Gradient overflow. Skipping step, reducing " + +- "loss scale to {}".format(self.loss_scaler.loss_scale())) +- return +- +- if closure is not None: +- retval = self._step_with_closure(closure) +- else: +- # torch.cuda.nvtx.range_push("pytorch optimizer step") +- retval = self.optimizer.step() +- # torch.cuda.nvtx.range_pop() +- +- self._master_params_to_model_params() +- +- return retval +- +- def _step_with_closure(self, closure): +- def wrapped_closure(): +- # helpful for debugging +- # print("Calling wrapped_closure, first_closure_call_this_step = {}" +- # .format(self.first_closure_call_this_step)) +- if self.first_closure_call_this_step: +- # We expect that the fp16 params are initially fresh on entering self.step(), +- # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() +- # is called within self.optimizer.step(). +- self.first_closure_call_this_step = False +- else: +- # If self.optimizer.step() internally calls wrapped_closure more than once, +- # it may update the fp32 params after each call. However, self.optimizer +- # doesn't know about the fp16 params at all. If the fp32 params get updated, +- # we can't rely on self.optimizer to refresh the fp16 params. We need +- # to handle that manually: +- self._master_params_to_model_params() +- # Our API expects the user to give us ownership of the backward() call by +- # replacing all calls to loss.backward() with optimizer.backward(loss). +- # This requirement holds whether or not the call to backward() is made within a closure. +- # If the user is properly calling optimizer.backward(loss) within "closure," +- # calling closure() here will give the fp32 master params fresh gradients +- # for the optimizer to play with, so all wrapped_closure needs to do is call +- # closure() and return the loss. +- temp_loss = closure() +- while(self.overflow): +- scale = self.loss_scaler.loss_scale() +- # self._update_scale(self.overflow) # now done at the end of backward +- print("OVERFLOW within closure! Skipping step, reducing loss scale to {}".format( +- self.loss_scaler.loss_scale())) +- temp_loss = closure() +- return temp_loss +- +- retval = self.optimizer.step(wrapped_closure) +- +- self.first_closure_call_this_step = True +- +- return retval +- +- def backward(self, loss, update_master_grads=True, retain_graph=False): +- """ +- :attr:`backward` performs the following conceptual steps: +- +- 1. fp32_loss = loss.float() (see first Note below) +- 2. scaled_loss = fp32_loss*loss_scale +- 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). +- 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. +- 5. Finally, master grads are divided by loss_scale. 
+- +- In this way, after :attr:`backward`, the master params have fresh gradients, +- and :attr:`step` may be called. +- +- .. note:: +- :attr:`backward` internally converts the loss to fp32 before applying the loss scale. +- This provides some additional safety against overflow if the user has supplied an +- fp16 loss value. +- However, for maximum overflow safety, the user should +- compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to +- :attr:`backward`. +- +- .. warning:: +- The gradients found in a model's leaves after the call to +- :attr:`backward` should not be regarded as valid in general, +- because it's possible +- they have been scaled (and in the case of dynamic loss scaling, +- the scale factor may change over time). +- If the user wants to inspect gradients after a call to :attr:`backward`, +- only the master gradients should be regarded as valid. These can be retrieved via +- :attr:`inspect_master_grad_data()`. +- +- Args: +- loss: The loss output by the user's model. loss may be either float or half (but see first Note above). +- update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. +- retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). +- +- Example:: +- +- # Ordinary operation: +- optimizer.backward(loss) +- +- # Naive operation with multiple losses (technically valid, but less efficient): +- # fp32 grads will be correct after the second call, but +- # the first call incurs an unnecessary fp16->fp32 grad copy. +- optimizer.backward(loss1) +- optimizer.backward(loss2) +- +- # More efficient way to handle multiple losses: +- # The fp16->fp32 grad copy is delayed until fp16 grads from all +- # losses have been accumulated. +- optimizer.backward(loss1, update_master_grads=False) +- optimizer.backward(loss2, update_master_grads=False) +- optimizer.update_master_grads() +- """ +- # To consider: try multiple backward passes using retain_grad=True to find +- # a loss scale that works. After you find a loss scale that works, do a final dummy +- # backward pass with retain_graph=False to tear down the graph. Doing this would avoid +- # discarding the iteration, but probably wouldn't improve overall efficiency. +- scaled_loss = loss.float()*self.loss_scaler.loss_scale() +- scaled_loss.backward(retain_graph=retain_graph) +- if update_master_grads: +- self.update_master_grads() +- +- def update_master_grads(self): +- # torch.cuda.nvtx.range_push("update_master_grads") +- """ +- Copy the ``.grad`` attribute from stored references to fp16 parameters to +- the ``.grad`` attribute of the fp32 master parameters that are directly +- updated by the optimizer. :attr:`update_master_grads` only needs to be called if +- ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. 
+- """ +- # if self.dynamic_loss_scale: +- # self._check_overflow() +- # if self.overflow: return +- # self._model_grads_to_master_grads() +- # self._downscale_master() +- # Use the one-shot multi-tensor apply kernel +- self.loss_scaler.clear_overflow_state() +- if len(self.all_fp16_params) > 0: +- # print("Model grads before") +- # print([param.grad.data for param in self.all_fp16_params]) +- # I'm ONLY writing this as an incremental way to make some tests pass until +- # I can refactor the tests as well. +- # FP16_Optimizer should not be used by anyone. +- model_grads = [] +- master_grads = [] +- for model_param, master_param in zip(self.all_fp16_params, +- self.all_fp32_from_fp16_params): +- if model_param.grad is not None: +- model_grads.append(model_param.grad) +- if master_param.grad is None: +- master_param.grad = torch.empty_like(master_param) +- master_grads.append(master_param.grad) +- self.loss_scaler.unscale( +- model_grads, +- master_grads, +- self.loss_scaler.loss_scale()) +- # print("Master grads after") +- # print([param.grad.data for param in self.all_fp32_from_fp16_params]) +- if len(self.all_fp32_from_fp32_params) > 0: +- model_grads = [] +- master_grads = [] +- for model_param, master_param in zip(self.all_fp32_from_fp32_params, +- self.all_fp32_from_fp32_params): +- if model_param.grad is not None: +- model_grads.append(model_param.grad) +- master_grads.append(master_param.grad) +- # print("Model grads before") +- # print([param.grad.data for param in self.all_fp32_from_fp32_params]) +- self.loss_scaler.unscale( +- model_grads, +- master_grads, +- self.loss_scaler.loss_scale()) +- # print("Master grads after") +- # print([param.grad.data for param in self.all_fp32_from_fp32_params]) +- # quit() +- self.overflow = self.loss_scaler.update_scale() +- # torch.cuda.nvtx.range_pop() +- +- +- def inspect_master_grad_data(self): +- """ +- When running with :class:`FP16_Optimizer`, +- ``.grad`` attributes of a model's fp16 leaves should not be +- regarded as truthful, because they might be scaled. +- After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, +- the fp32 master params' ``.grad`` +- attributes will contain valid gradients properly divided by the loss scale. However, +- because :class:`FP16_Optimizer` flattens some parameters, accessing them may be +- nonintuitive. :attr:`inspect_master_grad_data` +- allows those gradients to be viewed with shapes corresponding to their associated model leaves. +- +- Returns: +- List of lists (one list for each parameter group). The list for each parameter group +- is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. +- """ +- if self.overflow: +- print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. " +- "Gradients are currently invalid (may be inf, nan, or stale). Returning None.") +- return None +- else: +- # The optimizer owns only references to master params. 
+- master_grads_data = [] +- for param_group in self.optimizer.param_groups: +- master_grads_this_group = [] +- for param in param_group['params']: +- if param.grad is not None: +- master_grads_this_group.append(param.grad.data) +- else: +- master_grads_this_group.append(None) +- master_grads_data.append(master_grads_this_group) +- return master_grads_data +- +- +- # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" +- def _get_loss_scale(self): +- return self.loss_scaler.loss_scale() +- +- def _set_loss_scale(self, value): +- self.loss_scaler._loss_scale = value +- +- loss_scale = property(_get_loss_scale, _set_loss_scale) +- +- # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" +- def _get_state(self): +- return self.optimizer.state +- +- def _set_state(self, value): +- self.optimizer.state = value +- +- state = property(_get_state, _set_state) +- +- # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" +- # (for example, to adjust the learning rate) +- def _get_param_groups(self): +- return self.optimizer.param_groups +- +- def _set_param_groups(self, value): +- self.optimizer.param_groups = value +- +- param_groups = property(_get_param_groups, _set_param_groups) +- ++import torch ++from torch import nn ++from torch.autograd import Variable ++from torch.nn.parameter import Parameter ++from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors ++ ++from ..amp._amp_state import _amp_state, maybe_print ++from ..amp.scaler import LossScaler ++from ..multi_tensor_apply import multi_tensor_applier ++from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm ++ ++# TODO: Update overflow check + downscale to use Carl's fused kernel. ++class FP16_Optimizer(object): ++ def __init__(self, ++ init_optimizer, ++ static_loss_scale=1.0, ++ dynamic_loss_scale=False, ++ dynamic_loss_args=None, ++ verbose=True): ++ print("Warning: FP16_Optimizer is deprecated and dangerous, and will be deleted soon. " ++ "If it still works, you're probably getting lucky. " ++ "For mixed precision, use the documented API https://nvidia.github.io/apex/amp.html, with opt_level=O1.") ++ ++ if not torch.cuda.is_available: ++ raise SystemError("Cannot use fp16 without CUDA.") ++ ++ self.verbose = verbose ++ ++ self.optimizer = init_optimizer ++ # init_state_dict sets up an alternative way to cast per-param state tensors. ++ # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. ++ # init_state_dict = init_optimizer.state_dict() ++ ++ self.fp16_groups = [] ++ self.fp32_from_fp16_groups = [] ++ self.fp32_from_fp32_groups = [] ++ for i, param_group in enumerate(self.optimizer.param_groups): ++ self.maybe_print("FP16_Optimizer processing param group {}:".format(i)) ++ fp16_params_this_group = [] ++ fp32_params_this_group = [] ++ fp32_from_fp16_params_this_group = [] ++ for i, param in enumerate(param_group['params']): ++ if param.requires_grad: ++ if param.type() == 'torch.cuda.HalfTensor': ++ self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}" ++ .format(param.size())) ++ fp16_params_this_group.append(param) ++ master_param = param.detach().clone().float() ++ master_param.requires_grad = True ++ param_group['params'][i] = master_param ++ fp32_from_fp16_params_this_group.append(master_param) ++ # Reset existing state dict key to the new master param. 
++ # We still need to recast per-param state tensors, if any, to FP32. ++ if param in self.optimizer.state: ++ self.optimizer.state[master_param] = self.optimizer.state.pop(param) ++ elif param.type() == 'torch.cuda.FloatTensor': ++ self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}" ++ .format(param.size())) ++ fp32_params_this_group.append(param) ++ param_group['params'][i] = param ++ else: ++ raise TypeError("Wrapped parameters must be either " ++ "torch.cuda.FloatTensor or torch.cuda.HalfTensor. " ++ "Received {}".format(param.type())) ++ ++ self.fp16_groups.append(fp16_params_this_group) ++ self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) ++ self.fp32_from_fp32_groups.append(fp32_params_this_group) ++ ++ self.all_fp16_params = [] ++ for group in self.fp16_groups: ++ self.all_fp16_params += group ++ ++ self.all_fp32_from_fp16_params = [] ++ for group in self.fp32_from_fp16_groups: ++ self.all_fp32_from_fp16_params += group ++ ++ self.all_fp32_from_fp32_params = [] ++ for group in self.fp32_from_fp32_groups: ++ self.all_fp32_from_fp32_params += group ++ ++ # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors ++ self.optimizer.load_state_dict(self.optimizer.state_dict()) ++ # alternative way to cast per-param state tensors: ++ # self.optimizer.load_state_dict(init_state_dict) ++ ++ if dynamic_loss_scale: ++ self.dynamic_loss_scale = True ++ if dynamic_loss_args is not None: ++ self.loss_scaler = LossScaler("dynamic", **dynamic_loss_args) ++ else: ++ self.loss_scaler = LossScaler("dynamic") ++ else: ++ self.dynamic_loss_scale = False ++ self.loss_scaler = LossScaler(static_loss_scale) ++ ++ self.overflow = False ++ self.first_closure_call_this_step = True ++ ++ self.clip_grad_norm = clip_grad_norm ++ ++ # TODO: Centralize exposure and import error checking for the C backend. ++ if multi_tensor_applier.available: ++ import amp_C ++ self.multi_tensor_scale = amp_C.multi_tensor_scale ++ self._dummy_overflow_buf = torch.cuda.IntTensor([0]); ++ ++ # Having self.maybe_print distinct from _amp_state.maybe_print is another artifact ++ # of having to support FP16_Optimizer separately, for the time being. ++ def maybe_print(self, msg): ++ if self.verbose: ++ print(msg) ++ ++ def __getstate__(self): ++ raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") ++ ++ def __setstate__(self, state): ++ raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") ++ ++ def zero_grad(self, set_grads_to_None=False): ++ """ ++ Zero fp32 and fp16 parameter grads. ++ """ ++ # In principle, only the .grad attributes of the model params need to be zeroed, ++ # because gradients are copied into the FP32 master params. However, we zero ++ # all gradients owned by the optimizer, just to be safe: ++ for group in self.optimizer.param_groups: ++ for p in group['params']: ++ if set_grads_to_None: ++ p.grad = None ++ else: ++ if p.grad is not None: ++ p.grad.detach_() ++ p.grad.zero_() ++ ++ # Zero fp16 gradients owned by the model: ++ for fp16_group in self.fp16_groups: ++ for param in fp16_group: ++ if set_grads_to_None: ++ param.grad = None ++ else: ++ if param.grad is not None: ++ param.grad.detach_() # as in torch.optim.optimizer.zero_grad() ++ param.grad.zero_() ++ ++ # Should not be used anymore. 
++ # def _check_overflow(self): ++ # params = [] ++ # for group in self.fp16_groups: ++ # for param in group: ++ # params.append(param) ++ # for group in self.fp32_from_fp32_groups: ++ # for param in group: ++ # params.append(param) ++ # self.overflow = self.loss_scaler.has_overflow(params) ++ ++ # def _update_scale(self, has_overflow=False): ++ # self.loss_scaler.update_scale(has_overflow) ++ ++ def _master_params_to_model_params(self): ++ if multi_tensor_applier.available: ++ if len(self.all_fp16_params) > 0: ++ multi_tensor_applier( ++ self.multi_tensor_scale, ++ self._dummy_overflow_buf, ++ [self.all_fp32_from_fp16_params, self.all_fp16_params], ++ 1.0) ++ else: ++ for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): ++ master_params_to_model_params(fp16_group, fp32_from_fp16_group) ++ ++ # To consider: Integrate distributed with this wrapper by registering a hook on each variable ++ # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. ++ # def _model_grads_to_master_grads(self): ++ # for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): ++ # model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) ++ ++ # def _downscale_master(self): ++ # if self.loss_scale != 1.0: ++ # for group in self.optimizer.param_groups: ++ # for param in group['params']: ++ # if param.grad is not None: ++ # param.grad.data.mul_(1./self.loss_scale) ++ ++ def clip_master_grads(self, max_norm, norm_type=2): ++ """ ++ Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. ++ ++ Args: ++ max_norm (float or int): max norm of the gradients ++ norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for ++ infinity norm. ++ ++ Returns: ++ Total norm of the current fp32 gradients (viewed as a single vector). ++ ++ .. warning:: ++ Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). ++ """ ++ if not self.overflow: ++ fp32_params = [] ++ for param_group in self.optimizer.param_groups: ++ for param in param_group['params']: ++ fp32_params.append(param) ++ return self.clip_grad_norm(fp32_params, max_norm, norm_type) ++ else: ++ return -1 ++ ++ def state_dict(self): ++ """ ++ Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. ++ This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict ++ of the contained Pytorch optimizer. ++ Example:: ++ ++ checkpoint = {} ++ checkpoint['model'] = model.state_dict() ++ checkpoint['optimizer'] = optimizer.state_dict() ++ torch.save(checkpoint, "saved.pth") ++ """ ++ state_dict = {} ++ state_dict['loss_scaler'] = self.loss_scaler ++ state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale ++ state_dict['overflow'] = self.overflow ++ state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step ++ state_dict['optimizer_state_dict'] = self.optimizer.state_dict() ++ state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups ++ return state_dict ++ ++ def load_state_dict(self, state_dict): ++ """ ++ Loads a state_dict created by an earlier call to state_dict(). ++ If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, ++ whose parameters in turn came from ``model``, it is expected that the user ++ will call ``model.load_state_dict()`` before ++ ``fp16_optimizer_instance.load_state_dict()`` is called. 
++ ++ Example:: ++ ++ model = torch.nn.Linear(D_in, D_out).cuda().half() ++ optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) ++ optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) ++ ... ++ checkpoint = torch.load("saved.pth") ++ model.load_state_dict(checkpoint['model']) ++ optimizer.load_state_dict(checkpoint['optimizer']) ++ """ ++ # I think it should actually be ok to reload the optimizer before the model. ++ self.loss_scaler = state_dict['loss_scaler'] ++ self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] ++ self.overflow = state_dict['overflow'] ++ self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] ++ self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) ++ # At this point, the optimizer's references to the model's fp32 parameters are up to date. ++ # The optimizer's hyperparameters and internal buffers are also up to date. ++ # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still ++ # out of date. There are two options. ++ # 1: Refresh the master params from the model's fp16 params. ++ # This requires less storage but incurs precision loss. ++ # 2: Save and restore the fp32 master copies separately. ++ # We choose option 2. ++ # ++ # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device ++ # of their associated parameters, because it's possible those buffers might not exist yet in ++ # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been ++ # constructed in the same way as the one whose state_dict we are loading, the same master params ++ # are guaranteed to exist, so we can just copy_() from the saved master params. ++ for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): ++ for current, saved in zip(current_group, saved_group): ++ current.data.copy_(saved.data) ++ ++ def step(self, closure=None): # could add clip option. ++ """ ++ If no closure is supplied, :attr:`step` should be called after ++ ``fp16_optimizer_obj.backward(loss)``. ++ :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to ++ :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params ++ originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run ++ another forward pass using their model. ++ ++ If a closure is supplied, :attr:`step` may be called without a prior call to ++ :attr:`backward(loss)`. ++ This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. ++ However, the user should take care that any ``loss.backward()`` call within the closure ++ has been replaced by ``fp16_optimizer_obj.backward(loss)``. ++ ++ Args: ++ closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. ++ ++ Example with closure:: ++ ++ # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an ++ # existing pytorch optimizer. ++ for input, target in dataset: ++ def closure(): ++ optimizer.zero_grad() ++ output = model(input) ++ loss = loss_fn(output, target) ++ # loss.backward() becomes: ++ optimizer.backward(loss) ++ return loss ++ optimizer.step(closure) ++ ++ .. 
warning:: ++ Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. ++ ++ .. _`ordinary Pytorch optimizer use`: ++ http://pytorch.org/docs/master/optim.html#optimizer-step-closure ++ """ ++ ++ scale = self.loss_scaler.loss_scale() ++ # To consider: Should this be in step(), or update_master_grads? It works either way, ++ # but I should make it consistent with the Amp control flow, which updates the scale ++ # during backward context manager exit. ++ # self._update_scale(self.overflow) ++ ++ if self.overflow: ++ # Using _amp_state.maybe_print instead of self.print here is intentional. ++ maybe_print("Gradient overflow. Skipping step, reducing " + ++ "loss scale to {}".format(self.loss_scaler.loss_scale())) ++ return ++ ++ if closure is not None: ++ retval = self._step_with_closure(closure) ++ else: ++ # torch.cuda.nvtx.range_push("pytorch optimizer step") ++ retval = self.optimizer.step() ++ # torch.cuda.nvtx.range_pop() ++ ++ self._master_params_to_model_params() ++ ++ return retval ++ ++ def _step_with_closure(self, closure): ++ def wrapped_closure(): ++ # helpful for debugging ++ # print("Calling wrapped_closure, first_closure_call_this_step = {}" ++ # .format(self.first_closure_call_this_step)) ++ if self.first_closure_call_this_step: ++ # We expect that the fp16 params are initially fresh on entering self.step(), ++ # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() ++ # is called within self.optimizer.step(). ++ self.first_closure_call_this_step = False ++ else: ++ # If self.optimizer.step() internally calls wrapped_closure more than once, ++ # it may update the fp32 params after each call. However, self.optimizer ++ # doesn't know about the fp16 params at all. If the fp32 params get updated, ++ # we can't rely on self.optimizer to refresh the fp16 params. We need ++ # to handle that manually: ++ self._master_params_to_model_params() ++ # Our API expects the user to give us ownership of the backward() call by ++ # replacing all calls to loss.backward() with optimizer.backward(loss). ++ # This requirement holds whether or not the call to backward() is made within a closure. ++ # If the user is properly calling optimizer.backward(loss) within "closure," ++ # calling closure() here will give the fp32 master params fresh gradients ++ # for the optimizer to play with, so all wrapped_closure needs to do is call ++ # closure() and return the loss. ++ temp_loss = closure() ++ while(self.overflow): ++ scale = self.loss_scaler.loss_scale() ++ # self._update_scale(self.overflow) # now done at the end of backward ++ print("OVERFLOW within closure! Skipping step, reducing loss scale to {}".format( ++ self.loss_scaler.loss_scale())) ++ temp_loss = closure() ++ return temp_loss ++ ++ retval = self.optimizer.step(wrapped_closure) ++ ++ self.first_closure_call_this_step = True ++ ++ return retval ++ ++ def backward(self, loss, update_master_grads=True, retain_graph=False): ++ """ ++ :attr:`backward` performs the following conceptual steps: ++ ++ 1. fp32_loss = loss.float() (see first Note below) ++ 2. scaled_loss = fp32_loss*loss_scale ++ 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). ++ 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. ++ 5. Finally, master grads are divided by loss_scale. 
++ ++ In this way, after :attr:`backward`, the master params have fresh gradients, ++ and :attr:`step` may be called. ++ ++ .. note:: ++ :attr:`backward` internally converts the loss to fp32 before applying the loss scale. ++ This provides some additional safety against overflow if the user has supplied an ++ fp16 loss value. ++ However, for maximum overflow safety, the user should ++ compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to ++ :attr:`backward`. ++ ++ .. warning:: ++ The gradients found in a model's leaves after the call to ++ :attr:`backward` should not be regarded as valid in general, ++ because it's possible ++ they have been scaled (and in the case of dynamic loss scaling, ++ the scale factor may change over time). ++ If the user wants to inspect gradients after a call to :attr:`backward`, ++ only the master gradients should be regarded as valid. These can be retrieved via ++ :attr:`inspect_master_grad_data()`. ++ ++ Args: ++ loss: The loss output by the user's model. loss may be either float or half (but see first Note above). ++ update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. ++ retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). ++ ++ Example:: ++ ++ # Ordinary operation: ++ optimizer.backward(loss) ++ ++ # Naive operation with multiple losses (technically valid, but less efficient): ++ # fp32 grads will be correct after the second call, but ++ # the first call incurs an unnecessary fp16->fp32 grad copy. ++ optimizer.backward(loss1) ++ optimizer.backward(loss2) ++ ++ # More efficient way to handle multiple losses: ++ # The fp16->fp32 grad copy is delayed until fp16 grads from all ++ # losses have been accumulated. ++ optimizer.backward(loss1, update_master_grads=False) ++ optimizer.backward(loss2, update_master_grads=False) ++ optimizer.update_master_grads() ++ """ ++ # To consider: try multiple backward passes using retain_grad=True to find ++ # a loss scale that works. After you find a loss scale that works, do a final dummy ++ # backward pass with retain_graph=False to tear down the graph. Doing this would avoid ++ # discarding the iteration, but probably wouldn't improve overall efficiency. ++ scaled_loss = loss.float()*self.loss_scaler.loss_scale() ++ scaled_loss.backward(retain_graph=retain_graph) ++ if update_master_grads: ++ self.update_master_grads() ++ ++ def update_master_grads(self): ++ # torch.cuda.nvtx.range_push("update_master_grads") ++ """ ++ Copy the ``.grad`` attribute from stored references to fp16 parameters to ++ the ``.grad`` attribute of the fp32 master parameters that are directly ++ updated by the optimizer. :attr:`update_master_grads` only needs to be called if ++ ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. 
++ """ ++ # if self.dynamic_loss_scale: ++ # self._check_overflow() ++ # if self.overflow: return ++ # self._model_grads_to_master_grads() ++ # self._downscale_master() ++ # Use the one-shot multi-tensor apply kernel ++ self.loss_scaler.clear_overflow_state() ++ if len(self.all_fp16_params) > 0: ++ # print("Model grads before") ++ # print([param.grad.data for param in self.all_fp16_params]) ++ # I'm ONLY writing this as an incremental way to make some tests pass until ++ # I can refactor the tests as well. ++ # FP16_Optimizer should not be used by anyone. ++ model_grads = [] ++ master_grads = [] ++ for model_param, master_param in zip(self.all_fp16_params, ++ self.all_fp32_from_fp16_params): ++ if model_param.grad is not None: ++ model_grads.append(model_param.grad) ++ if master_param.grad is None: ++ master_param.grad = torch.empty_like(master_param) ++ master_grads.append(master_param.grad) ++ self.loss_scaler.unscale( ++ model_grads, ++ master_grads, ++ self.loss_scaler.loss_scale()) ++ # print("Master grads after") ++ # print([param.grad.data for param in self.all_fp32_from_fp16_params]) ++ if len(self.all_fp32_from_fp32_params) > 0: ++ model_grads = [] ++ master_grads = [] ++ for model_param, master_param in zip(self.all_fp32_from_fp32_params, ++ self.all_fp32_from_fp32_params): ++ if model_param.grad is not None: ++ model_grads.append(model_param.grad) ++ master_grads.append(master_param.grad) ++ # print("Model grads before") ++ # print([param.grad.data for param in self.all_fp32_from_fp32_params]) ++ self.loss_scaler.unscale( ++ model_grads, ++ master_grads, ++ self.loss_scaler.loss_scale()) ++ # print("Master grads after") ++ # print([param.grad.data for param in self.all_fp32_from_fp32_params]) ++ # quit() ++ self.overflow = self.loss_scaler.update_scale() ++ # torch.cuda.nvtx.range_pop() ++ ++ ++ def inspect_master_grad_data(self): ++ """ ++ When running with :class:`FP16_Optimizer`, ++ ``.grad`` attributes of a model's fp16 leaves should not be ++ regarded as truthful, because they might be scaled. ++ After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, ++ the fp32 master params' ``.grad`` ++ attributes will contain valid gradients properly divided by the loss scale. However, ++ because :class:`FP16_Optimizer` flattens some parameters, accessing them may be ++ nonintuitive. :attr:`inspect_master_grad_data` ++ allows those gradients to be viewed with shapes corresponding to their associated model leaves. ++ ++ Returns: ++ List of lists (one list for each parameter group). The list for each parameter group ++ is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. ++ """ ++ if self.overflow: ++ print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. " ++ "Gradients are currently invalid (may be inf, nan, or stale). Returning None.") ++ return None ++ else: ++ # The optimizer owns only references to master params. 
++ master_grads_data = [] ++ for param_group in self.optimizer.param_groups: ++ master_grads_this_group = [] ++ for param in param_group['params']: ++ if param.grad is not None: ++ master_grads_this_group.append(param.grad.data) ++ else: ++ master_grads_this_group.append(None) ++ master_grads_data.append(master_grads_this_group) ++ return master_grads_data ++ ++ ++ # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" ++ def _get_loss_scale(self): ++ return self.loss_scaler.loss_scale() ++ ++ def _set_loss_scale(self, value): ++ self.loss_scaler._loss_scale = value ++ ++ loss_scale = property(_get_loss_scale, _set_loss_scale) ++ ++ # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" ++ def _get_state(self): ++ return self.optimizer.state ++ ++ def _set_state(self, value): ++ self.optimizer.state = value ++ ++ state = property(_get_state, _set_state) ++ ++ # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" ++ # (for example, to adjust the learning rate) ++ def _get_param_groups(self): ++ return self.optimizer.param_groups ++ ++ def _set_param_groups(self, value): ++ self.optimizer.param_groups = value ++ ++ param_groups = property(_get_param_groups, _set_param_groups) ++ +diff -Nur '--exclude=.git' apex/apex/optimizers/__init__.py apex-npu/apex/optimizers/__init__.py +--- apex/apex/optimizers/__init__.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/optimizers/__init__.py 2021-06-17 07:10:45.389712070 +0000 +@@ -2,4 +2,10 @@ + from .fused_adam import FusedAdam + from .fused_novograd import FusedNovoGrad + from .fused_lamb import FusedLAMB +-from .fused_adagrad import FusedAdagrad +\ No newline at end of file ++from .fused_adagrad import FusedAdagrad ++from .npu_fused_sgd import NpuFusedSGD ++from .npu_fused_adam import NpuFusedAdam ++from .npu_fused_bert_adam import NpuFusedBertAdam ++from .npu_fused_adadelta import NpuFusedAdadelta ++from .npu_fused_lamb import NpuFusedLamb ++from .lamb import Lamb +\ No newline at end of file +diff -Nur '--exclude=.git' apex/apex/parallel/LARC.py apex-npu/apex/parallel/LARC.py +--- apex/apex/parallel/LARC.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/parallel/LARC.py 2021-06-17 07:10:45.389712070 +0000 +@@ -86,8 +86,16 @@ + for p in group['params']: + if p.grad is None: + continue +- param_norm = torch.norm(p.data) +- grad_norm = torch.norm(p.grad.data) ++ # todo:restore this modification after torch.norm support any npu format ++ format_id = p.storage().npu_format() ++ if format_id != 0: ++ p_cl = p.npu_format_cast(0) ++ p_cl_grad = p.grad.npu_format_cast(0) ++ param_norm = torch.norm(p_cl.data) ++ grad_norm = torch.norm(p_cl_grad.data) ++ else: ++ param_norm = torch.norm(p.data) ++ grad_norm = torch.norm(p.grad.data) + + if param_norm != 0 and grad_norm != 0: + # calculate adaptive lr + weight decay +diff -Nur '--exclude=.git' apex/apex/reparameterization/__init__.py apex-npu/apex/reparameterization/__init__.py +--- apex/apex/reparameterization/__init__.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/reparameterization/__init__.py 2021-06-17 07:10:45.393712101 +0000 +@@ -1,127 +1,127 @@ +-from .weight_norm import WeightNorm +-from .reparameterization import Reparameterization +- +-def apply_weight_norm(module, name='', dim=0, hook_child=True): +- r""" +- Applies weight normalization to a parameter in the given module. 
+- If no parameter is provided, applies weight normalization to all +- parameters in model (except 1-d vectors and scalars). +- +- .. math:: +- \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} +- +- Weight normalization is a reparameterization that decouples the magnitude +- of a weight tensor from its direction. This replaces the parameter specified +- by `name` (e.g. "weight") with two parameters: one specifying the magnitude +- (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). +- Weight normalization is implemented via a hook that recomputes the weight +- tensor from the magnitude and direction before every :meth:`~Module.forward` +- call. +- +- By default, with `dim=0`, the norm is computed independently per output +- channel/plane. To compute a norm over the entire weight tensor, use +- `dim=None`. +- +- See https://arxiv.org/abs/1602.07868 +- +- Args: +- module (nn.Module): containing module +- name (str, optional): name of weight parameter +- dim (int, optional): dimension over which to compute the norm +- hook_child (boolean, optional): adds reparameterization hook to direct parent of the +- parameters. If False, it's added to `module` instead. Default: True +- +- Returns: +- The original module with the weight norm hook +- +- Example:: +- +- >>> m = apply_weight_norm(nn.Linear(20, 40), name='weight') +- Linear (20 -> 40) +- >>> m.weight_g.size() +- torch.Size([40, 1]) +- >>> m.weight_v.size() +- torch.Size([40, 20]) +- +- """ +- return apply_reparameterization(module, reparameterization=WeightNorm, hook_child=hook_child, +- name=name, dim=dim) +- +-def remove_weight_norm(module, name='', remove_all=False): +- """ +- Removes the weight normalization reparameterization of a parameter from a module. +- If no parameter is supplied then all weight norm parameterizations are removed. +- Args: +- module (nn.Module): containing module +- name (str, optional): name of weight parameter +- Example: +- >>> m = apply_weight_norm(nn.Linear(20, 40)) +- >>> remove_weight_norm(m) +- """ +- return remove_reparameterization(module, reparameterization=WeightNorm, +- name=name, remove_all=remove_all) +- +-def apply_reparameterization(module, reparameterization=None, name='', dim=0, hook_child=True): +- """ +- Applies a given weight reparameterization (such as weight normalization) to +- a parameter in the given module. If no parameter is given, applies the reparameterization +- to all parameters in model (except 1-d vectors and scalars). +- +- Args: +- module (nn.Module): containing module +- reparameterization (Reparameterization): reparamaterization class to apply +- name (str, optional): name of weight parameter +- dim (int, optional): dimension over which to perform reparameterization op +- hook_child (boolean, optional): adds reparameterization hook to direct parent of the +- parameters. If False, it's added to `module` instead. 
Default: True +- +- Returns: +- The original module with the reparameterization hook +- +- Example:: +- +- >>> m = apply_reparameterization(nn.Linear(20, 40), WeightNorm) +- Linear (20 -> 40) +- +- """ +- assert reparameterization is not None +- if name != '': +- Reparameterization.apply(module, name, dim, reparameterization, hook_child) +- else: +- names = list(module.state_dict().keys()) +- for name in names: +- apply_reparameterization(module, reparameterization, name, dim, hook_child) +- return module +- +-def remove_reparameterization(module, reparameterization=Reparameterization, +- name='', remove_all=False): +- """ +- Removes the given reparameterization of a parameter from a module. +- If no parameter is supplied then all reparameterizations are removed. +- Args: +- module (nn.Module): containing module +- reparameterization (Reparameterization): reparamaterization class to apply +- name (str, optional): name of weight parameter +- remove_all (bool, optional): if True, remove all reparamaterizations of given type. Default: False +- Example: +- >>> m = apply_reparameterization(nn.Linear(20, 40),WeightNorm) +- >>> remove_reparameterization(m) +- """ +- if name != '' or remove_all: +- to_remove = [] +- for k, hook in module._forward_pre_hooks.items(): +- if isinstance(hook, reparameterization) and (hook.name == name or remove_all): +- hook.remove(module) +- to_remove.append(k) +- if len(to_remove) > 0: +- for k in to_remove: +- del module._forward_pre_hooks[k] +- return module +- if not remove_all: +- raise ValueError("reparameterization of '{}' not found in {}" +- .format(name, module)) +- else: +- modules = [module]+[x for x in module.modules()] +- for m in modules: +- remove_reparameterization(m, reparameterization=reparameterization, remove_all=True) +- return module ++from .weight_norm import WeightNorm ++from .reparameterization import Reparameterization ++ ++def apply_weight_norm(module, name='', dim=0, hook_child=True): ++ r""" ++ Applies weight normalization to a parameter in the given module. ++ If no parameter is provided, applies weight normalization to all ++ parameters in model (except 1-d vectors and scalars). ++ ++ .. math:: ++ \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} ++ ++ Weight normalization is a reparameterization that decouples the magnitude ++ of a weight tensor from its direction. This replaces the parameter specified ++ by `name` (e.g. "weight") with two parameters: one specifying the magnitude ++ (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). ++ Weight normalization is implemented via a hook that recomputes the weight ++ tensor from the magnitude and direction before every :meth:`~Module.forward` ++ call. ++ ++ By default, with `dim=0`, the norm is computed independently per output ++ channel/plane. To compute a norm over the entire weight tensor, use ++ `dim=None`. ++ ++ See https://arxiv.org/abs/1602.07868 ++ ++ Args: ++ module (nn.Module): containing module ++ name (str, optional): name of weight parameter ++ dim (int, optional): dimension over which to compute the norm ++ hook_child (boolean, optional): adds reparameterization hook to direct parent of the ++ parameters. If False, it's added to `module` instead. 
Default: True ++ ++ Returns: ++ The original module with the weight norm hook ++ ++ Example:: ++ ++ >>> m = apply_weight_norm(nn.Linear(20, 40), name='weight') ++ Linear (20 -> 40) ++ >>> m.weight_g.size() ++ torch.Size([40, 1]) ++ >>> m.weight_v.size() ++ torch.Size([40, 20]) ++ ++ """ ++ return apply_reparameterization(module, reparameterization=WeightNorm, hook_child=hook_child, ++ name=name, dim=dim) ++ ++def remove_weight_norm(module, name='', remove_all=False): ++ """ ++ Removes the weight normalization reparameterization of a parameter from a module. ++ If no parameter is supplied then all weight norm parameterizations are removed. ++ Args: ++ module (nn.Module): containing module ++ name (str, optional): name of weight parameter ++ Example: ++ >>> m = apply_weight_norm(nn.Linear(20, 40)) ++ >>> remove_weight_norm(m) ++ """ ++ return remove_reparameterization(module, reparameterization=WeightNorm, ++ name=name, remove_all=remove_all) ++ ++def apply_reparameterization(module, reparameterization=None, name='', dim=0, hook_child=True): ++ """ ++ Applies a given weight reparameterization (such as weight normalization) to ++ a parameter in the given module. If no parameter is given, applies the reparameterization ++ to all parameters in model (except 1-d vectors and scalars). ++ ++ Args: ++ module (nn.Module): containing module ++ reparameterization (Reparameterization): reparamaterization class to apply ++ name (str, optional): name of weight parameter ++ dim (int, optional): dimension over which to perform reparameterization op ++ hook_child (boolean, optional): adds reparameterization hook to direct parent of the ++ parameters. If False, it's added to `module` instead. Default: True ++ ++ Returns: ++ The original module with the reparameterization hook ++ ++ Example:: ++ ++ >>> m = apply_reparameterization(nn.Linear(20, 40), WeightNorm) ++ Linear (20 -> 40) ++ ++ """ ++ assert reparameterization is not None ++ if name != '': ++ Reparameterization.apply(module, name, dim, reparameterization, hook_child) ++ else: ++ names = list(module.state_dict().keys()) ++ for name in names: ++ apply_reparameterization(module, reparameterization, name, dim, hook_child) ++ return module ++ ++def remove_reparameterization(module, reparameterization=Reparameterization, ++ name='', remove_all=False): ++ """ ++ Removes the given reparameterization of a parameter from a module. ++ If no parameter is supplied then all reparameterizations are removed. ++ Args: ++ module (nn.Module): containing module ++ reparameterization (Reparameterization): reparamaterization class to apply ++ name (str, optional): name of weight parameter ++ remove_all (bool, optional): if True, remove all reparamaterizations of given type. 
Default: False ++ Example: ++ >>> m = apply_reparameterization(nn.Linear(20, 40),WeightNorm) ++ >>> remove_reparameterization(m) ++ """ ++ if name != '' or remove_all: ++ to_remove = [] ++ for k, hook in module._forward_pre_hooks.items(): ++ if isinstance(hook, reparameterization) and (hook.name == name or remove_all): ++ hook.remove(module) ++ to_remove.append(k) ++ if len(to_remove) > 0: ++ for k in to_remove: ++ del module._forward_pre_hooks[k] ++ return module ++ if not remove_all: ++ raise ValueError("reparameterization of '{}' not found in {}" ++ .format(name, module)) ++ else: ++ modules = [module]+[x for x in module.modules()] ++ for m in modules: ++ remove_reparameterization(m, reparameterization=reparameterization, remove_all=True) ++ return module +diff -Nur '--exclude=.git' apex/apex/reparameterization/reparameterization.py apex-npu/apex/reparameterization/reparameterization.py +--- apex/apex/reparameterization/reparameterization.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/reparameterization/reparameterization.py 2021-06-17 07:10:45.393712101 +0000 +@@ -1,151 +1,151 @@ +-import torch +-from torch.nn.parameter import Parameter +-import sys +-class Reparameterization(object): +- """ +- Class interface for performing weight reparameterizations +- Arguments: +- name (str): name of weight parameter +- dim (int): dimension over which to compute the norm +- module (nn.Module): parent module to which param `name` is registered to +- retain_forward (bool, optional): if False deletes weight on call to +- module.backward. Used to avoid memory leaks with DataParallel Default: True +- Attributes: +- reparameterization_names (list, str): contains names of all parameters +- needed to compute reparameterization. +- backward_hook_key (int): torch.utils.hooks.RemovableHandle.id for hook used in module backward pass. +- """ +- +- def __init__(self, name, dim, module, retain_forward=True): +- self.name = name +- self.dim = dim +- self.evaluated = False +- self.retain_forward = retain_forward +- self.reparameterization_names = [] +- self.backward_hook_key = None +- self.module = module +- +- def compute_weight(self, module=None, name=None): +- """ +- Computes reparameterized weight value to assign value to module attribute +- with name `name`. +- See WeightNorm class for example. +- Arguments: +- module (nn.Module): module with weight we'd like to reparameterize +- Returns: +- w (Tensor): Tensor object containing value of reparameterized weight +- """ +- raise NotImplementedError +- +- def reparameterize(self, name, weight, dim): +- """ +- Creates Parameters to be used for reparameterization and creates names that +- for attributes for the module these Parameters will correspond to. +- The parameters will be registered according to the names provided. +- See WeightNorm class for example. +- Arguments: +- module (nn.Module): module with weight we'd like to reparameterize +- name (str, optional): name of weight parameter +- dim (int, optional): dimension over which to compute parameterization +- Returns: +- names (list, str): names of Parameters to be used for reparameterization +- params (list, Parameter): Parameters to be used for reparameterization +- """ +- raise NotImplementedError +- +- @staticmethod +- def apply(module, name, dim, reparameterization=None, hook_child=True): +- """ +- Applies reparametrization to module's `name` parameter and modifies instance attributes as appropriate. +- `hook_child` adds reparameterization hook to direct parent of the parameters. 
If False, it's added to `module` instead. +- """ +- if reparameterization is None: +- reparameterization = Reparameterization +- module2use, name2use = Reparameterization.get_module_and_name(module, name) +- # does not work on sparse +- if name2use is None or isinstance(module2use, (torch.nn.Embedding, torch.nn.EmbeddingBag)): +- return +- +- if hook_child: +- fn = reparameterization(name2use, dim, module2use) +- else: +- fn = reparameterization(name, dim, module) +- +- weight = getattr(module2use, name2use) +- if weight.dim() <= 1: +- return +- +- # remove weight from parameter list +- del module2use._parameters[name2use] +- +- # add parameters of reparameterization of parameter to module +- names, params = fn.reparameterize(name2use, weight, dim) +- for n, p in zip(names, params): +- module2use.register_parameter(n, p) +- +- # add parameters to reparameterization so they can be removed later +- fn.reparameterization_names = names +- +- setattr(module2use, name2use, None) +- +- hook_module = module2use +- if not hook_child: +- hook_module = module +- # recompute weight before every forward() +- hook_module.register_forward_pre_hook(fn) +- +- # remove weight during backward +- handle = hook_module.register_backward_hook(fn.backward_hook) +- # get hook key so we can delete it later +- fn.backward_hook_key = handle.id +- +- return fn +- +- @staticmethod +- def get_module_and_name(module, name): +- """ +- recursively fetches (possible) child module and name of weight to be reparameterized +- """ +- name2use = None +- module2use = None +- names = name.split('.') +- if len(names) == 1 and names[0] != '': +- name2use = names[0] +- module2use = module +- elif len(names) > 1: +- module2use = module +- name2use = names[0] +- for i in range(len(names)-1): +- module2use = getattr(module2use, name2use) +- name2use = names[i+1] +- return module2use, name2use +- +- def get_params(self, module): +- """gets params of reparameterization based on known attribute names""" +- return [getattr(module, n) for n in self.reparameterization_names] +- +- def remove(self, module): +- """removes reparameterization and backward hook (does not remove forward hook)""" +- module2use, name2use = Reparameterization.get_module_and_name(module, self.name) +- for p in self.get_params(module2use): +- p.requires_grad = False +- weight = self.compute_weight(module2use, name2use) +- delattr(module2use, name2use) +- for n in self.reparameterization_names: +- del module2use._parameters[n] +- module2use.register_parameter(name2use, Parameter(weight.data)) +- del module._backward_hooks[self.backward_hook_key] +- +- def __call__(self, module, inputs): +- """callable hook for forward pass""" +- module2use, name2use = Reparameterization.get_module_and_name(module, self.name) +- _w = getattr(module2use, name2use) +- if not self.evaluated or _w is None: +- setattr(module2use, name2use, self.compute_weight(module2use, name2use)) +- self.evaluated = True +- +- def backward_hook(self, module, grad_input, grad_output): +- """callable hook for backward pass""" +- module2use, name2use = Reparameterization.get_module_and_name(module, self.name) +- wn = getattr(module2use, name2use) +- self.evaluated = False ++import torch ++from torch.nn.parameter import Parameter ++import sys ++class Reparameterization(object): ++ """ ++ Class interface for performing weight reparameterizations ++ Arguments: ++ name (str): name of weight parameter ++ dim (int): dimension over which to compute the norm ++ module (nn.Module): parent module to which param `name` is 
registered to ++ retain_forward (bool, optional): if False deletes weight on call to ++ module.backward. Used to avoid memory leaks with DataParallel Default: True ++ Attributes: ++ reparameterization_names (list, str): contains names of all parameters ++ needed to compute reparameterization. ++ backward_hook_key (int): torch.utils.hooks.RemovableHandle.id for hook used in module backward pass. ++ """ ++ ++ def __init__(self, name, dim, module, retain_forward=True): ++ self.name = name ++ self.dim = dim ++ self.evaluated = False ++ self.retain_forward = retain_forward ++ self.reparameterization_names = [] ++ self.backward_hook_key = None ++ self.module = module ++ ++ def compute_weight(self, module=None, name=None): ++ """ ++ Computes reparameterized weight value to assign value to module attribute ++ with name `name`. ++ See WeightNorm class for example. ++ Arguments: ++ module (nn.Module): module with weight we'd like to reparameterize ++ Returns: ++ w (Tensor): Tensor object containing value of reparameterized weight ++ """ ++ raise NotImplementedError ++ ++ def reparameterize(self, name, weight, dim): ++ """ ++ Creates Parameters to be used for reparameterization and creates names that ++ for attributes for the module these Parameters will correspond to. ++ The parameters will be registered according to the names provided. ++ See WeightNorm class for example. ++ Arguments: ++ module (nn.Module): module with weight we'd like to reparameterize ++ name (str, optional): name of weight parameter ++ dim (int, optional): dimension over which to compute parameterization ++ Returns: ++ names (list, str): names of Parameters to be used for reparameterization ++ params (list, Parameter): Parameters to be used for reparameterization ++ """ ++ raise NotImplementedError ++ ++ @staticmethod ++ def apply(module, name, dim, reparameterization=None, hook_child=True): ++ """ ++ Applies reparametrization to module's `name` parameter and modifies instance attributes as appropriate. ++ `hook_child` adds reparameterization hook to direct parent of the parameters. If False, it's added to `module` instead. 
++ """ ++ if reparameterization is None: ++ reparameterization = Reparameterization ++ module2use, name2use = Reparameterization.get_module_and_name(module, name) ++ # does not work on sparse ++ if name2use is None or isinstance(module2use, (torch.nn.Embedding, torch.nn.EmbeddingBag)): ++ return ++ ++ if hook_child: ++ fn = reparameterization(name2use, dim, module2use) ++ else: ++ fn = reparameterization(name, dim, module) ++ ++ weight = getattr(module2use, name2use) ++ if weight.dim() <= 1: ++ return ++ ++ # remove weight from parameter list ++ del module2use._parameters[name2use] ++ ++ # add parameters of reparameterization of parameter to module ++ names, params = fn.reparameterize(name2use, weight, dim) ++ for n, p in zip(names, params): ++ module2use.register_parameter(n, p) ++ ++ # add parameters to reparameterization so they can be removed later ++ fn.reparameterization_names = names ++ ++ setattr(module2use, name2use, None) ++ ++ hook_module = module2use ++ if not hook_child: ++ hook_module = module ++ # recompute weight before every forward() ++ hook_module.register_forward_pre_hook(fn) ++ ++ # remove weight during backward ++ handle = hook_module.register_backward_hook(fn.backward_hook) ++ # get hook key so we can delete it later ++ fn.backward_hook_key = handle.id ++ ++ return fn ++ ++ @staticmethod ++ def get_module_and_name(module, name): ++ """ ++ recursively fetches (possible) child module and name of weight to be reparameterized ++ """ ++ name2use = None ++ module2use = None ++ names = name.split('.') ++ if len(names) == 1 and names[0] != '': ++ name2use = names[0] ++ module2use = module ++ elif len(names) > 1: ++ module2use = module ++ name2use = names[0] ++ for i in range(len(names)-1): ++ module2use = getattr(module2use, name2use) ++ name2use = names[i+1] ++ return module2use, name2use ++ ++ def get_params(self, module): ++ """gets params of reparameterization based on known attribute names""" ++ return [getattr(module, n) for n in self.reparameterization_names] ++ ++ def remove(self, module): ++ """removes reparameterization and backward hook (does not remove forward hook)""" ++ module2use, name2use = Reparameterization.get_module_and_name(module, self.name) ++ for p in self.get_params(module2use): ++ p.requires_grad = False ++ weight = self.compute_weight(module2use, name2use) ++ delattr(module2use, name2use) ++ for n in self.reparameterization_names: ++ del module2use._parameters[n] ++ module2use.register_parameter(name2use, Parameter(weight.data)) ++ del module._backward_hooks[self.backward_hook_key] ++ ++ def __call__(self, module, inputs): ++ """callable hook for forward pass""" ++ module2use, name2use = Reparameterization.get_module_and_name(module, self.name) ++ _w = getattr(module2use, name2use) ++ if not self.evaluated or _w is None: ++ setattr(module2use, name2use, self.compute_weight(module2use, name2use)) ++ self.evaluated = True ++ ++ def backward_hook(self, module, grad_input, grad_output): ++ """callable hook for backward pass""" ++ module2use, name2use = Reparameterization.get_module_and_name(module, self.name) ++ wn = getattr(module2use, name2use) ++ self.evaluated = False +diff -Nur '--exclude=.git' apex/apex/reparameterization/weight_norm.py apex-npu/apex/reparameterization/weight_norm.py +--- apex/apex/reparameterization/weight_norm.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/apex/reparameterization/weight_norm.py 2021-06-17 07:10:45.393712101 +0000 +@@ -1,78 +1,78 @@ +-import torch +-from torch.nn.parameter import Parameter +-from 
..fp16_utils import Fused_Weight_Norm +-import time +- +-from .reparameterization import Reparameterization +- +-def _norm(p, dim): +- """Computes the norm over all dimensions except dim""" +- if dim is None: +- return p.norm() +- elif dim == 0: +- output_size = (p.size(0),) + (1,) * (p.dim() - 1) +- return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) +- elif dim == p.dim() - 1: +- output_size = (1,) * (p.dim() - 1) + (p.size(-1),) +- return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) +- return _norm(p.transpose(0, dim), 0).transpose(0, dim) +- +-HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor) +- +-class WeightNorm(Reparameterization): +- r""" +- Weight normalization is a reparameterization that decouples the magnitude +- of a weight tensor from its direction. This replaces the parameter specified +- by `name` (e.g. "weight") with two parameters: one specifying the magnitude +- (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). +- Weight normalization is implemented via a hook that recomputes the weight +- tensor from the magnitude and direction before every :meth:`~Module.forward` +- call. +- +- .. math:: +- \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} +- +- By default, with `dim=0`, the norm is computed independently per output +- channel/plane. To compute a norm over the entire weight tensor, use +- `dim=None`. +- """ +- def compute_weight(self, module=None, name=None): +- """ +- Computes weight normalized weight value to assign value to module attribute +- with name `name`. +- Arguments: +- module (nn.Module): module with weight we'd like to reparameterize +- Returns: +- w (Tensor): Tensor object containing value of reparameterized weight +- """ +- if module is None: +- module = self.module +- if name is None: +- name = self.name +- module, name = Reparameterization.get_module_and_name(module, name) +- g = getattr(module, name + '_g') +- v = getattr(module, name + '_v') +- +- fused_weight_norm = Fused_Weight_Norm.apply +- v = v.contiguous() +- w = fused_weight_norm(v, g, self.dim) +- +- return w +- +- def reparameterize(self, name, weight, dim): +- """ +- Creates Parameters v and gto be used for weight normalization +- and creates names that for attributes for the module these Parameters +- will correspond to. The parameters will be registered according to the names +- provided. 
+- Arguments: +- module (nn.Module): module with weight we'd like to reparameterize +- name (str, optional): name of weight parameter +- dim (int, optional): dimension over which to compute parameterization +- Returns: +- names (list, str): names of Parameters to be used for reparameterization +- params (list, Parameter): Parameters to be used for reparameterization +- """ +- names = [name + '_g', name + '_v'] +- params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)] +- return names, params ++import torch ++from torch.nn.parameter import Parameter ++from ..fp16_utils import Fused_Weight_Norm ++import time ++ ++from .reparameterization import Reparameterization ++ ++def _norm(p, dim): ++ """Computes the norm over all dimensions except dim""" ++ if dim is None: ++ return p.norm() ++ elif dim == 0: ++ output_size = (p.size(0),) + (1,) * (p.dim() - 1) ++ return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) ++ elif dim == p.dim() - 1: ++ output_size = (1,) * (p.dim() - 1) + (p.size(-1),) ++ return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) ++ return _norm(p.transpose(0, dim), 0).transpose(0, dim) ++ ++HALF_TYPES = (torch.cuda.HalfTensor, torch.HalfTensor) ++ ++class WeightNorm(Reparameterization): ++ r""" ++ Weight normalization is a reparameterization that decouples the magnitude ++ of a weight tensor from its direction. This replaces the parameter specified ++ by `name` (e.g. "weight") with two parameters: one specifying the magnitude ++ (e.g. "weight_g") and one specifying the direction (e.g. "weight_v"). ++ Weight normalization is implemented via a hook that recomputes the weight ++ tensor from the magnitude and direction before every :meth:`~Module.forward` ++ call. ++ ++ .. math:: ++ \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} ++ ++ By default, with `dim=0`, the norm is computed independently per output ++ channel/plane. To compute a norm over the entire weight tensor, use ++ `dim=None`. ++ """ ++ def compute_weight(self, module=None, name=None): ++ """ ++ Computes weight normalized weight value to assign value to module attribute ++ with name `name`. ++ Arguments: ++ module (nn.Module): module with weight we'd like to reparameterize ++ Returns: ++ w (Tensor): Tensor object containing value of reparameterized weight ++ """ ++ if module is None: ++ module = self.module ++ if name is None: ++ name = self.name ++ module, name = Reparameterization.get_module_and_name(module, name) ++ g = getattr(module, name + '_g') ++ v = getattr(module, name + '_v') ++ ++ fused_weight_norm = Fused_Weight_Norm.apply ++ v = v.contiguous() ++ w = fused_weight_norm(v, g, self.dim) ++ ++ return w ++ ++ def reparameterize(self, name, weight, dim): ++ """ ++ Creates Parameters v and gto be used for weight normalization ++ and creates names that for attributes for the module these Parameters ++ will correspond to. The parameters will be registered according to the names ++ provided. 
++ Arguments: ++ module (nn.Module): module with weight we'd like to reparameterize ++ name (str, optional): name of weight parameter ++ dim (int, optional): dimension over which to compute parameterization ++ Returns: ++ names (list, str): names of Parameters to be used for reparameterization ++ params (list, Parameter): Parameters to be used for reparameterization ++ """ ++ names = [name + '_g', name + '_v'] ++ params = [Parameter(_norm(weight, dim).data), Parameter(weight.data)] ++ return names, params +diff -Nur '--exclude=.git' apex/setup.py apex-npu/setup.py +--- apex/setup.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/setup.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import torch + from torch.utils import cpp_extension + from setuptools import setup, find_packages +@@ -6,6 +22,7 @@ + import sys + import warnings + import os ++import glob + + # ninja build does not work unless include_dirs are abs path + this_dir = os.path.dirname(os.path.abspath(__file__)) +@@ -32,11 +49,7 @@ + 'If you wish to cross-compile for a single specific architecture,\n' + 'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n') + if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: +- _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) +- if int(bare_metal_major) == 11: +- os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0" +- else: +- os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5" ++ os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5" + + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split('.')[0]) +@@ -50,6 +63,10 @@ + ext_modules = [] + + extras = {} ++ ++secure_compile_args = ['-fPIE', '-fPIC', '-fstack-protector-all', '-Wall'] ++secure_link_args = ['-Wl,-z,now', '-Wl,-z,relro', '-Wl,-z,noexecstack', '-s'] ++ + if "--pyprof" in sys.argv: + string = "\n\nPyprof has been moved to its own dedicated repository and will " + \ + "soon be removed from Apex. 
Please visit\n" + \ +@@ -78,7 +95,15 @@ + sys.argv.remove("--cpp_ext") + ext_modules.append( + CppExtension('apex_C', +- ['csrc/flatten_unflatten.cpp',])) ++ ['csrc/flatten_unflatten.cpp',], ++ extra_compile_args=secure_compile_args, ++ extra_link_args=secure_link_args)) ++ ++ ext_modules.append( ++ CppExtension('change_data_ptr', ++ ['csrc/combine_tensors/change_dataptr.cpp',], ++ extra_compile_args=secure_compile_args, ++ extra_link_args=secure_link_args)) + + def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) +@@ -192,6 +217,21 @@ + extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, + 'nvcc':['-O3'] + version_dependent_macros})) + ++if "--npu_float_status" in sys.argv: ++ from torch.utils.cpp_extension import CppExtension ++ sys.argv.remove("--npu_float_status") ++ ++ from torch.utils.cpp_extension import BuildExtension ++ cmdclass['build_ext'] = BuildExtension ++ ++ sources = glob.glob(os.path.join(this_dir, 'csrc/npu_float_status', '*.cpp')) ++ ++ ext_modules.append( ++ CppExtension(name='npu_float_status', ++ sources=sources, ++ extra_compile_args=secure_compile_args, ++ extra_link_args=secure_link_args)) ++ + if "--bnp" in sys.argv: + from torch.utils.cpp_extension import CUDAExtension + sys.argv.remove("--bnp") +@@ -404,7 +444,7 @@ + + setup( + name='apex', +- version='0.1', ++ version='0.1+ascend', + packages=find_packages(exclude=('build', + 'csrc', + 'include', +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_add_param_group.py apex-npu/tests/L0/run_amp/test_add_param_group.py +--- apex/tests/L0/run_amp/test_add_param_group.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_add_param_group.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import unittest + + import functools as ft +@@ -9,16 +25,20 @@ + from torch import nn + import torch.nn.functional as F + from torch.nn import Parameter ++import numpy as np + +-from utils import common_init, HALF, FLOAT,\ +- ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT ++from utils import common_init ++import sys ++sys.path.append('../') ++import device + + class MyModel(torch.nn.Module): + def __init__(self, unique): + super(MyModel, self).__init__() + self.weight0 = Parameter(unique + +- torch.arange(2, device='cuda', dtype=torch.float32)) +- self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16)) ++ torch.from_numpy(np.arange(2, dtype=np.float32))) ++ self.weight1 = Parameter(1. 
+ unique + ++ torch.from_numpy(np.arange(2, dtype=np.float16)).to(device.CALCULATE_DEVICE )) + + @staticmethod + def ops(input, weight0, weight1): +@@ -33,7 +53,8 @@ + + class TestAddParamGroup(unittest.TestCase): + def setUp(self): +- self.x = torch.ones((2), device='cuda', dtype=torch.float32) ++ self.device = device.CALCULATE_DEVICE ++ self.x = torch.ones((2), device=self.device, dtype=torch.float32) + common_init(self) + + def tearDown(self): +@@ -54,8 +75,8 @@ + for opt_level in ("O0", "O1", "O2", "O3"): + for zero_before_add in (True, False): + for try_accumulation in (True, False): +- model0 = MyModel(1) +- model1 = MyModel(2) ++ model0 = MyModel(1).to(self.device) ++ model1 = MyModel(2).to(self.device) + + optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], + momentum=0.125) +@@ -89,8 +110,8 @@ + [param.data.clone() for param in model1.parameters()] + + for how_to_zero in "none", "model", "optimizer": +- model0 = MyModel(1) +- model1 = MyModel(2) ++ model0 = MyModel(1).to(self.device) ++ model1 = MyModel(2).to(self.device) + + optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], + momentum=0.125) +@@ -139,7 +160,8 @@ + [param.data.clone() for param in model1.parameters()] + + for reference, final in zip(reference_params, final_params): +- self.assertTrue(torch.allclose(reference.to(final.dtype), final), ++ final = final.to(torch.float32) ++ self.assertTrue(torch.allclose(reference.to(final.dtype).to('cpu'), final.to('cpu')), + "opt_level = {}, how_to_zero = {}, zero_before_add = {}".format( + opt_level, how_to_zero, zero_before_add)) + +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_basic_casts.py apex-npu/tests/L0/run_amp/test_basic_casts.py +--- apex/tests/L0/run_amp/test_basic_casts.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_basic_casts.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ + import unittest + + import functools as ft +@@ -7,73 +23,89 @@ + import torch + from torch import nn + import torch.nn.functional as F ++import numpy as np ++ ++from utils import common_init, generate_data ++import utils ++ ++import sys ++sys.path.append('../') ++import device ++ ++npu_input_grad = None + +-from utils import common_init, HALF, FLOAT,\ +- ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT ++def npu_input_grad_hook(grad): ++ global npu_input_grad ++ npu_input_grad = grad.to('cpu') + + def run_layer_test(test_case, fns, expected, input_shape, test_backward=True): + for fn, typ in it.product(fns, expected.keys()): +- x = torch.randn(input_shape, dtype=typ).requires_grad_() ++ x = generate_data(0, 10, input_shape, typ).requires_grad_() ++ x = x.to(test_case.device) ++ x.register_hook(npu_input_grad_hook) + y = fn(x) + test_case.assertEqual(y.type(), expected[typ]) + if test_backward: + y.float().sum().backward() +- test_case.assertEqual(x.grad.type(), MATCH_INPUT[typ]) ++ test_case.assertEqual(npu_input_grad.type().split(".")[-1], utils.MATCH_INPUT[typ].split(".")[-1]) + + class TestBasicCasts(unittest.TestCase): + def setUp(self): + self.handle = amp.init(enabled=True) ++ self.device = device.CALCULATE_DEVICE + common_init(self) + + def tearDown(self): + self.handle._deactivate() + + def test_linear_is_half(self): +- m = nn.Linear(self.h, self.h) ++ m = nn.Linear(self.h, self.h).to(self.device) + f = ft.partial(F.linear, weight=m.weight, bias=m.bias) +- run_layer_test(self, [m, f], ALWAYS_HALF, (self.b, self.h)) ++ run_layer_test(self, [m, f], utils.ALWAYS_HALF, (self.b, self.h)) + + def test_conv2d_is_half(self): +- m = nn.Conv2d(self.c, self.c, self.k) ++ m = nn.Conv2d(self.c, self.c, self.k).to(self.device) + f = ft.partial(F.conv2d, weight=m.weight, bias=m.bias) +- run_layer_test(self, [m, f], ALWAYS_HALF, (self.b, self.c, self.h, self.h)) ++ run_layer_test(self, [m, f], utils.ALWAYS_HALF, (self.b, self.c, self.h, self.h)) + + def test_softmax_is_float(self): +- m = nn.Softmax(dim=1) ++ m = nn.Softmax(dim=1).to(self.device) + f = ft.partial(F.softmax, dim=1) +- run_layer_test(self, [m, f], ALWAYS_FLOAT, (self.b, self.h)) ++ run_layer_test(self, [m, f], utils.ALWAYS_FLOAT, (self.b, self.h)) + ++ @unittest.skipIf(device.is_npu(),"NPU does not support group_norm in half") + def test_group_norm_is_float(self): +- m = nn.GroupNorm(num_groups=4, num_channels=self.c) +- run_layer_test(self, [m], ALWAYS_FLOAT, (self.b, self.c, self.h, self.h)) ++ m = nn.GroupNorm(num_groups=4, num_channels=self.c).to(self.device) ++ run_layer_test(self, [m], utils.ALWAYS_FLOAT, (self.b, self.c, self.h, self.h)) + + def test_mse_loss_is_float(self): + shape = (self.b, self.h) +- target = torch.randn(shape) +- mod = nn.MSELoss() ++ target = torch.randn(shape).to(self.device) ++ mod = nn.MSELoss().to(self.device) + m = lambda x: mod(x, target) + f = ft.partial(F.mse_loss, target=target) +- run_layer_test(self, [m], ALWAYS_FLOAT, shape) ++ run_layer_test(self, [m], utils.ALWAYS_FLOAT, shape) + + def test_relu_is_match(self): +- run_layer_test(self, [nn.ReLU(), F.relu], MATCH_INPUT, (self.b, self.h)) ++ run_layer_test(self, [nn.ReLU(), F.relu], utils.MATCH_INPUT, (self.b, self.h)) + + def test_batch_norm_is_match(self): +- m = nn.BatchNorm2d(num_features=self.c) ++ m = nn.BatchNorm2d(num_features=self.c).to(self.device) + f = ft.partial(F.batch_norm, running_mean=m.running_mean, running_var=m.running_var, + weight=m.weight, bias=m.bias, training=True) +- run_layer_test(self, [m], MATCH_INPUT, (self.b, self.c, 
self.h, self.h)) ++ run_layer_test(self, [m], utils.MATCH_INPUT, (self.b, self.c, self.h, self.h)) + + # Test forward-only for BN inference + m.eval() + f = ft.partial(F.batch_norm, running_mean=m.running_mean, running_var=m.running_var, + weight=m.weight, bias=m.bias, training=False) +- run_layer_test(self, [m, f], MATCH_INPUT, (self.b, self.c, self.h, self.h), ++ run_layer_test(self, [m, f], utils.MATCH_INPUT, (self.b, self.c, self.h, self.h), + test_backward=False) + + class TestBannedMethods(unittest.TestCase): + def setUp(self): + self.handle = amp.init(enabled=True) ++ self.device = device.CALCULATE_DEVICE + common_init(self) + + def tearDown(self): +@@ -81,12 +113,12 @@ + + def bce_common(self, assertion): + shape = (self.b, self.h) +- target = torch.rand(shape) +- mod = nn.BCELoss() ++ target = torch.rand(shape).to(self.device) ++ mod = nn.BCELoss().to(self.device) + m = lambda x: mod(x, target) + f = ft.partial(F.binary_cross_entropy, target=target) + for fn in [m, f]: +- x = torch.rand(shape, dtype=torch.half) ++ x = generate_data(0, 10, shape, np.float16).to(self.device) + assertion(fn, x) + + def test_bce_raises_by_default(self): +@@ -96,36 +128,37 @@ + def test_bce_is_float_with_allow_banned(self): + self.handle._deactivate() + self.handle = amp.init(enabled=True, allow_banned=True) +- assertion = lambda fn, x: self.assertEqual(fn(x).type(), FLOAT) ++ assertion = lambda fn, x: self.assertEqual(fn(x).type(), utils.FLOAT) + self.bce_common(assertion) + + class TestTensorCasts(unittest.TestCase): + def setUp(self): + self.handle = amp.init(enabled=True) ++ self.device = device.CALCULATE_DEVICE + common_init(self) + + def tearDown(self): + self.handle._deactivate() + + def test_matmul_method_is_half(self): +- other = torch.randn(self.h, self.h) ++ other = torch.randn(self.h, self.h).to(self.device) + lhs = lambda x: x.matmul(other) + rhs = lambda x: other.matmul(x) +- run_layer_test(self, [lhs, rhs], ALWAYS_HALF, (self.h, self.h)) ++ run_layer_test(self, [lhs, rhs], utils.ALWAYS_HALF, (self.h, self.h)) + + def test_matmul_op_is_half(self): +- other = torch.randn(self.h, self.h) ++ other = torch.randn(self.h, self.h).to(self.device) + lhs = lambda x: x @ other + rhs = lambda x: other @ x +- run_layer_test(self, [lhs, rhs], ALWAYS_HALF, (self.h, self.h)) ++ run_layer_test(self, [lhs, rhs], utils.ALWAYS_HALF, (self.h, self.h)) + + def test_pow_method_is_float(self): + fn = lambda x: x.pow(2.) +- run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h)) ++ run_layer_test(self, [fn], utils.ALWAYS_FLOAT, (self.b, self.h)) + + def test_pow_op_is_float(self): + fn = lambda x: x ** 2. +- run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h)) ++ run_layer_test(self, [fn], utils.ALWAYS_FLOAT, (self.b, self.h)) + + def test_cpu_is_float(self): + fn = lambda x: x.cpu() +@@ -135,7 +168,7 @@ + + def test_sum_is_float(self): + fn = lambda x: x.sum() +- run_layer_test(self, [fn], ALWAYS_FLOAT, (self.b, self.h)) ++ run_layer_test(self, [fn], utils.ALWAYS_FLOAT, (self.b, self.h)) + + # TODO: maybe more tests on disabled casting? + +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_cache.py apex-npu/tests/L0/run_amp/test_cache.py +--- apex/tests/L0/run_amp/test_cache.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_cache.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. 
++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import unittest + + import functools as ft +@@ -8,9 +24,16 @@ + import torch + from torch import nn + import torch.nn.functional as F ++import numpy as np ++import sys ++sys.path.append('../') ++import device ++import utils + + from utils import common_init, HALF, FLOAT,\ +- ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT ++ ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT,\ ++ generate_data ++ + + def get_reference_grad(i, w, ops): + # Creating new tensors ensures, among other things, that the new tensors are not in the cache. +@@ -24,7 +47,8 @@ + class WhitelistModule(torch.nn.Module): + def __init__(self, dtype): + super(WhitelistModule, self).__init__() +- self.weight = torch.nn.Parameter(torch.arange(8*8, device='cuda', dtype=dtype).view(8,8)) ++ weight_parameter = torch.from_numpy(np.arange(8*8, dtype=dtype)).view(8,8).to(device.CALCULATE_DEVICE) ++ self.weight = torch.nn.Parameter(weight_parameter) + + @staticmethod + def ops(input, weight): +@@ -37,7 +61,8 @@ + class BlacklistModule(torch.nn.Module): + def __init__(self, dtype): + super(BlacklistModule, self).__init__() +- self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8)) ++ weight_parameter = torch.from_numpy(np.arange(2*8, dtype=dtype)).view(2,8).to(device.CALCULATE_DEVICE) ++ self.weight = torch.nn.Parameter(weight_parameter) + + @staticmethod + def ops(input, weight): +@@ -50,7 +75,8 @@ + class PromoteModule(torch.nn.Module): + def __init__(self, dtype): + super(PromoteModule, self).__init__() +- self.weight = torch.nn.Parameter(torch.arange(2*8, device='cuda', dtype=dtype).view(2,8)) ++ weight_parameter = torch.from_numpy(np.arange(2*8, dtype=dtype)).view(2,8).to(device.CALCULATE_DEVICE) ++ self.weight = torch.nn.Parameter(weight_parameter) + + @staticmethod + def ops(input, weight): +@@ -61,14 +87,14 @@ + + class TestCache(unittest.TestCase): + def setUp(self): +- self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32) ++ self.x = torch.ones((2, 8), dtype=torch.float32).to(device.CALCULATE_DEVICE) + common_init(self) + + def tearDown(self): + pass + + def train_eval_train_test(self, module, t): +- model = module(t).cuda() ++ model = module(t).to(device.CALCULATE_DEVICE) + optimizer = torch.optim.SGD(model.parameters(), lr=1.0) + + _amp_state.allow_incoming_model_not_fp32 = True +@@ -91,10 +117,10 @@ + + # Currently there's no difference in the allclose calls, so no need for branching, + # but I'm keeping this in case we want different tolerances for fp16 and fp32 checks. 
+- if model.weight.grad.type() == "torch.cuda.HalfTensor": +- self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad)) +- elif model.weight.grad.type() == "torch.cuda.FloatTensor": +- self.assertTrue(torch.allclose(model.weight.grad.float(), reference_grad)) ++ if model.weight.grad.type() == utils.HALF: ++ self.assertTrue(torch.allclose(model.weight.grad.float().to('cpu'), reference_grad.to('cpu'))) ++ elif model.weight.grad.type() == utils.FLOAT: ++ self.assertTrue(torch.allclose(model.weight.grad.float().to('cpu'), reference_grad.to('cpu'))) + else: + raise RuntimeError("model.weight.grad.type = {}".format(model.weight.grad.type())) + +@@ -115,22 +141,25 @@ + # I could easily have these as a set of for loops in a single test, + # instead of going for granularity. + def test_whitelist_module_fp16_weight(self): +- self.train_eval_train_test(WhitelistModule, torch.float16) ++ self.train_eval_train_test(WhitelistModule, np.float16) ++ + + def test_whitelist_module_fp32_weight(self): +- self.train_eval_train_test(WhitelistModule, torch.float32) ++ self.train_eval_train_test(WhitelistModule, np.float32) ++ + + def test_blacklist_module_fp16_weight(self): +- self.train_eval_train_test(BlacklistModule, torch.float16) ++ self.train_eval_train_test(BlacklistModule, np.float16) ++ + + def test_blacklist_module_fp32_weight(self): +- self.train_eval_train_test(BlacklistModule, torch.float32) ++ self.train_eval_train_test(BlacklistModule, np.float32) + + def test_promote_module_fp16_weight(self): +- self.train_eval_train_test(PromoteModule, torch.float16) ++ self.train_eval_train_test(PromoteModule, np.float16) + + def test_promote_module_fp32_weight(self): +- self.train_eval_train_test(PromoteModule, torch.float32) ++ self.train_eval_train_test(PromoteModule, np.float32) + + + if __name__ == '__main__': +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_checkpointing.py apex-npu/tests/L0/run_amp/test_checkpointing.py +--- apex/tests/L0/run_amp/test_checkpointing.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_checkpointing.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ + import unittest + + import torch +@@ -7,9 +23,8 @@ + + from apex import amp + +- + from utils import common_init, FLOAT +- ++import utils + + class MyModel(torch.nn.Module): + def __init__(self): +@@ -40,7 +55,7 @@ + if 'num_batches_tracked' in key: + continue + param = state_dict[key] +- self.assertEqual(param.type(), FLOAT, ++ self.assertEqual(param.type(), utils.FLOAT, + 'Parameter in state_dict not FLOAT') + + def train_step(self, model, optimizer, data, loss_ids): +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_larc.py apex-npu/tests/L0/run_amp/test_larc.py +--- apex/tests/L0/run_amp/test_larc.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_larc.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,5 +1,5 @@ + import unittest +- ++import sys + import torch + from torch import nn + from torch.nn import Parameter +@@ -8,12 +8,14 @@ + from apex.parallel.LARC import LARC + from utils import common_init + ++sys.path.append('../') ++import device + + class MyModel(torch.nn.Module): + def __init__(self, unique): + super(MyModel, self).__init__() + self.weight0 = Parameter( +- unique + torch.arange(2, device="cuda", dtype=torch.float32) ++ unique + torch.arange(2, device=device.CALCULATE_DEVICE, dtype=torch.float32) + ) + + def forward(self, input): +@@ -22,7 +24,7 @@ + + class TestLARC(unittest.TestCase): + def setUp(self): +- self.x = torch.ones((2), device="cuda", dtype=torch.float32) ++ self.x = torch.ones((2), device=device.CALCULATE_DEVICE, dtype=torch.float32) + common_init(self) + + def tearDown(self): +@@ -39,7 +41,7 @@ + ) + + model, optimizer = amp.initialize( +- model, optimizer, opt_level=opt_level, verbosity=0 ++ model, optimizer, opt_level=opt_level, loss_scale=1024, verbosity=0 + ) + + optimizer.zero_grad() +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_promotion.py apex-npu/tests/L0/run_amp/test_promotion.py +--- apex/tests/L0/run_amp/test_promotion.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_promotion.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ + import unittest + + import itertools as it +@@ -7,11 +23,17 @@ + from torch import nn + import torch.nn.functional as F + +-from utils import common_init, HALF, FLOAT, DTYPES ++from utils import common_init, HALF, FLOAT, DTYPES,\ ++ generate_data ++import utils ++import sys ++sys.path.append('../') ++import device + + class TestPromotion(unittest.TestCase): + def setUp(self): + self.handle = amp.init(enabled=True) ++ self.device = device.CALCULATE_DEVICE + common_init(self) + + def tearDown(self): +@@ -20,12 +42,13 @@ + def run_binary_promote_test(self, fns, input_shape, x_inplace=False): + type_pairs = it.product(DTYPES, DTYPES) + for fn, (xtype, ytype) in it.product(fns, type_pairs): +- x = torch.randn(input_shape, dtype=xtype).requires_grad_() ++ x = generate_data(0, 10, input_shape, xtype).requires_grad_() + x_leaf = x + if x_inplace: + # We need a non-leaf to call in place on + x = x.clone() +- y = torch.randn(input_shape, dtype=ytype) ++ y = generate_data(0, 10, input_shape, dtype=ytype).to(self.device) ++ x = x.to(self.device) + out = fn(x, y) + if x_inplace: + # In place: always match xtype +@@ -33,9 +56,9 @@ + else: + # Out of place: match widest type + if xtype == torch.float or ytype == torch.float: +- self.assertEqual(out.type(), FLOAT) ++ self.assertEqual(out.type(), utils.FLOAT) + else: +- self.assertEqual(out.type(), HALF) ++ self.assertEqual(out.type(), utils.HALF) + out.float().sum().backward() + self.assertEqual(x_leaf.grad.dtype, xtype) + +@@ -51,19 +74,19 @@ + + def test_cat_matches_widest(self): + shape = self.b +- ys = [torch.randn(shape, dtype=torch.half) for _ in range(5)] +- x_float = torch.randn(shape) ++ ys = [generate_data(0, 10, shape, dtype=torch.half).to(self.device) for _ in range(5)] ++ x_float = generate_data(0, 10, shape, dtype=torch.float).to(self.device) + out = torch.cat(ys + [x_float]) +- self.assertEqual(out.type(), FLOAT) +- x_half = torch.randn(shape, dtype=torch.half) ++ self.assertEqual(out.type(), utils.FLOAT) ++ x_half = generate_data(0, 10, shape, dtype=torch.half).to(self.device) + out = torch.cat(ys + [x_half]) +- self.assertEqual(out.type(), HALF) ++ self.assertEqual(out.type(), utils.HALF) + + def test_inplace_exp_is_error_for_half(self): +- xs = torch.randn(self.b) ++ xs = generate_data(0, 10, self.b, dtype=torch.float).to(self.device) + xs.exp_() +- self.assertEqual(xs.type(), FLOAT) +- xs = torch.randn(self.b, dtype=torch.half) ++ self.assertEqual(xs.type(), utils.FLOAT) ++ xs = generate_data(0, 10, self.b, dtype=torch.half).to(self.device) + with self.assertRaises(NotImplementedError): + xs.exp_() + +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_rnn.py apex-npu/tests/L0/run_amp/test_rnn.py +--- apex/tests/L0/run_amp/test_rnn.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/test_rnn.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ + import unittest + + from apex import amp +@@ -5,7 +21,8 @@ + import torch + from torch import nn + +-from utils import common_init, HALF ++from utils import common_init ++import utils + + class TestRnnCells(unittest.TestCase): + def setUp(self): +@@ -34,7 +51,7 @@ + output = hidden + outputs.append(output) + for y in outputs: +- self.assertEqual(y.type(), HALF) ++ self.assertEqual(y.type(), utils.HALF) + outputs[-1].float().sum().backward() + for i, x in enumerate(xs): + self.assertEqual(x.grad.dtype, x.dtype) +@@ -69,7 +86,7 @@ + else: + hidden = hidden_fn() + output, _ = rnn(x, hidden) +- self.assertEqual(output.type(), HALF) ++ self.assertEqual(output.type(), utils.HALF) + output[-1, :, :].float().sum().backward() + self.assertEqual(x.grad.dtype, x.dtype) + +@@ -108,7 +125,7 @@ + torch.set_default_tensor_type(torch.cuda.FloatTensor) + hidden = torch.zeros((num_layers, self.b, self.h), dtype=typ) + output, _ = rnn(packed_seq, hidden) +- self.assertEqual(output.data.type(), HALF) ++ self.assertEqual(output.data.type(), utils.HALF) + output.data.float().sum().backward() + self.assertEqual(x.grad.dtype, x.dtype) + +diff -Nur '--exclude=.git' apex/tests/L0/run_amp/utils.py apex-npu/tests/L0/run_amp/utils.py +--- apex/tests/L0/run_amp/utils.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_amp/utils.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,7 +1,28 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ + import torch ++import numpy as np ++ ++import sys ++sys.path.append('../') ++import device + +-HALF = 'torch.cuda.HalfTensor' +-FLOAT = 'torch.cuda.FloatTensor' ++HALF = 'torch.npu.HalfTensor' ++FLOAT = 'torch.npu.FloatTensor' + + DTYPES = [torch.half, torch.float] + +@@ -18,4 +39,28 @@ + test_case.c = 16 + test_case.k = 3 + test_case.t = 10 +- torch.set_default_tensor_type(torch.cuda.FloatTensor) ++ global HALF, FLOAT, DTYPES, ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT ++ if device.is_npu(): ++ HALF = 'torch.npu.HalfTensor' ++ FLOAT = 'torch.npu.FloatTensor' ++ torch.set_default_tensor_type(torch.FloatTensor) ++ else: ++ HALF = 'torch.cuda.HalfTensor' ++ FLOAT = 'torch.cuda.FloatTensor' ++ torch.set_default_tensor_type(torch.cuda.FloatTensor) ++ ++ ALWAYS_HALF = {torch.float: HALF, ++ torch.half: HALF} ++ ALWAYS_FLOAT = {torch.float: FLOAT, ++ torch.half: FLOAT} ++ MATCH_INPUT = {torch.float: FLOAT, ++ torch.half: HALF} ++ ++def generate_data(min, max, shape, dtype): ++ if dtype == torch.float32: ++ dtype = np.float32 ++ if dtype == torch.float16: ++ dtype = np.float16 ++ input1 = np.random.uniform(min, max, shape).astype(dtype) ++ npu_input1 = torch.from_numpy(input1) ++ return npu_input1 +\ No newline at end of file +diff -Nur '--exclude=.git' apex/tests/L0/run_test.py apex-npu/tests/L0/run_test.py +--- apex/tests/L0/run_test.py 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L0/run_test.py 2021-06-17 07:10:45.397712131 +0000 +@@ -1,20 +1,71 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
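The patched utils.py above selects the `HALF`/`FLOAT` type strings per device inside `common_init` and builds test inputs through numpy with `generate_data`, so the same tests can run on either NPU or CUDA. A minimal usage sketch, assuming the patched utils.py and its `device` helper are importable and an NPU device `npu:0` is available:

```python
import torch
from utils import generate_data  # utils.py as patched above

# Bounded uniform input built through numpy on the CPU first.
x = generate_data(0, 10, (4, 4), torch.float16)
print(x.dtype)             # torch.float16

x_npu = x.to("npu:0")      # requires an Ascend (NPU) build of PyTorch
print(x_npu.type())        # expected: 'torch.npu.HalfTensor'
```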
++ + import unittest + import sys +- +-test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"] ++import device ++import torch ++import argparse + + runner = unittest.TextTestRunner(verbosity=2) +- + errcode = 0 + +-for test_dir in test_dirs: +- suite = unittest.TestLoader().discover(test_dir) +- +- print("\nExecuting tests from " + test_dir) ++parser = argparse.ArgumentParser() ++parser.add_argument('--npu', ++ default=0, ++ type=int, ++ help='NPU id to use.') ++args = parser.parse_args() ++ ++device.CALCULATE_DEVICE = "npu:{}".format(args.npu) ++torch.npu.set_device(device.CALCULATE_DEVICE) ++ ++if device.is_npu(): ++ sys.path.append('./run_amp') ++ sys.path.append('../../apex/contrib/test/') ++ from test_basic_casts import TestBannedMethods, TestTensorCasts, TestBasicCasts ++ from test_cache import TestCache ++ from test_promotion import TestPromotion ++ from test_larc import TestLARC ++ from test_combine_tensors import TestCombineTensors ++ test_dirs = ["run_amp"] ++ suite=unittest.TestSuite() ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestBannedMethods)) ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestTensorCasts)) ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestBasicCasts)) ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCache)) ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestPromotion)) ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestLARC)) ++ suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCombineTensors)) + + result = runner.run(suite) +- + if not result.wasSuccessful(): + errcode = 1 ++ sys.exit(errcode) ++else: ++ test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"] ++ ++ for test_dir in test_dirs: ++ suite = unittest.TestLoader().discover(test_dir) ++ ++ print("\nExecuting tests from " + test_dir) ++ ++ result = runner.run(suite) ++ ++ if not result.wasSuccessful(): ++ errcode = 1 + +-sys.exit(errcode) ++ sys.exit(errcode) +diff -Nur '--exclude=.git' apex/tests/L1/cross_product/run.sh apex-npu/tests/L1/cross_product/run.sh +--- apex/tests/L1/cross_product/run.sh 2021-04-12 04:03:22.000000000 +0000 ++++ apex-npu/tests/L1/cross_product/run.sh 2021-06-17 07:10:45.401712162 +0000 +@@ -3,4 +3,5 @@ + # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/" + # DATADIR="/opt/home/apex/examples/imagenet/" + cp ../common/* . +-bash run_test.sh single_gpu $1 ++# bash run_test.sh single_gpu $1 ++bash run_test_npu.sh single_npu $1 $2 diff --git a/src/apex/contrib/combine_tensors/__init__.py b/src/apex/contrib/combine_tensors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..576eaf06c925876f9487e2d9f8b017c41b2e5c41 --- /dev/null +++ b/src/apex/contrib/combine_tensors/__init__.py @@ -0,0 +1 @@ +from .combine_tensors import combine_npu, get_part_combined_tensor, is_combined_tensor_valid diff --git a/src/apex/contrib/combine_tensors/combine_tensors.py b/src/apex/contrib/combine_tensors/combine_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..68743abcaae4f84af4faaeaa34dda8bb97ef8f6d --- /dev/null +++ b/src/apex/contrib/combine_tensors/combine_tensors.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021, Huawei Technologies.All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from change_data_ptr import change_data_ptr + +def combine_npu(list_of_tensor, require_copy_value = True): + total_numel = 0 + for tensor in list_of_tensor: + total_numel += tensor.storage().size() + + if total_numel == 0: + return None + + dtype = list_of_tensor[0].dtype + combined_tensor = torch.zeros(total_numel, dtype=dtype).npu() + + idx = 0 + if require_copy_value: + for tensor in list_of_tensor: + temp = tensor.clone() + change_data_ptr(tensor, combined_tensor, idx) + tensor.copy_(temp) + idx += tensor.storage().size() + else: + for tensor in list_of_tensor: + change_data_ptr(tensor, combined_tensor, idx) + idx += tensor.storage().size() + return combined_tensor + +def get_part_combined_tensor(combined_tensor, index, size): + if combined_tensor is None or size == 0: + return None + + part_tensor = torch.zeros(size, dtype=combined_tensor.dtype).npu() + change_data_ptr(part_tensor, combined_tensor, index) + return part_tensor + +def is_combined_tensor_valid(combined_tensor, list_of_tensor): + if combined_tensor is None: + return False + + combined_tensor_start_addr = combined_tensor.data_ptr() + combined_tensor_end_addr = combined_tensor_start_addr + \ + combined_tensor.storage().size() * combined_tensor.element_size() + + for tensor in list_of_tensor: + if tensor is None or \ + tensor.data_ptr() < combined_tensor_start_addr or \ + tensor.data_ptr() >= combined_tensor_end_addr: + return False + return True diff --git a/src/apex/contrib/test/test_combine_tensors.py b/src/apex/contrib/test/test_combine_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..51854703b3f0950a954bd4dd1b1f72edac20dfab --- /dev/null +++ b/src/apex/contrib/test/test_combine_tensors.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
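combine_tensors.py above re-points each member tensor's storage into a single contiguous NPU buffer through the `change_data_ptr` extension, so later elementwise work touches one fused buffer instead of one tensor per parameter. The test file that follows exercises these properties; as a compact usage sketch (assuming an Ascend build of PyTorch with `torch.npu` and the `change_data_ptr` extension installed):

```python
import torch
from apex.contrib.combine_tensors import (
    combine_npu, get_part_combined_tensor, is_combined_tensor_valid)

x = torch.zeros(4).npu()
y = torch.zeros((2, 3)).npu()

combined = combine_npu([x, y])   # x and y now alias slices of `combined`
combined.add_(1.0)               # one launch updates both member tensors
print(x.sum().item(), y.sum().item())   # 4.0 6.0

assert is_combined_tensor_valid(combined, [x, y])

# A flat view over y's slice of the fused buffer; per the implementation
# above, the offset is x's storage size in elements.
y_view = get_part_combined_tensor(combined, x.storage().size(), y.storage().size())
```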
+ +import unittest +import argparse +import functools as ft +import itertools as it +import sys +from apex import amp +import torch +from apex.contrib.combine_tensors import combine_npu +sys.path.append('../../../tests/L0') +import device + +x_before_combine_des = 'x before combine: ' +y_before_combine_des = 'y before combine: ' +z_before_combine_des = 'z before combine: ' +x_after_combine_des = 'x after combine: ' +y_after_combine_des = 'y after combine: ' +z_after_combine_des = 'z after combine: ' +combine_tensor_des = 'combine tensor: ' + +def print_tensor_phy_info(des, tensor): + print(des, tensor.data_ptr(), tensor.size(), tensor.device) + +class TestCombineTensors(unittest.TestCase): + + def setUp(self): + self.device = device.CALCULATE_DEVICE + + def basic_functionality(self, dtype = torch.float32): + x = torch.zeros((2,2,2,2), device = self.device, dtype = dtype) + y = torch.zeros((4,4), device = self.device, dtype = dtype) + z = torch.zeros((3,3,3), device = self.device, dtype = dtype) + + print_tensor_phy_info(x_before_combine_des, x) + print_tensor_phy_info(y_before_combine_des, y) + print_tensor_phy_info(z_before_combine_des, z) + + lst = [x, y, z] + + combine_tensor = combine_npu(lst) + print() + print_tensor_phy_info(combine_tensor_des, combine_tensor) + print_tensor_phy_info(x_after_combine_des, x) + print_tensor_phy_info(y_after_combine_des, y) + print_tensor_phy_info(z_after_combine_des, z) + + # test if combine_tensor is contiguous, and x,y,z are will moved into the combine_tensor. + self.assertEqual(True, combine_tensor.is_contiguous()) + self.assertEqual(combine_tensor.data_ptr(), x.data_ptr()) + self.assertEqual(x.data_ptr() + x.storage().size() * x.element_size(), y.data_ptr()) + self.assertEqual(y.data_ptr() + y.storage().size() * y.element_size(), z.data_ptr()) + self.assertEqual(combine_tensor.storage().size(), x.storage().size() + y.storage().size() + z.storage().size()) + + def test_basic_fp32(self): + print('----------------------test basic functionality of fp32------------------------') + self.basic_functionality(dtype=torch.float32) + + def test_large_functionality(self): + print('----------------------test functionality with large tensors------------------------') + x = torch.zeros((200,20,200,20), device = self.device) + y = torch.zeros((4000,4000), device = self.device) + z = torch.zeros((300,300,300), device = self.device) + + print_tensor_phy_info(x_before_combine_des, x) + print_tensor_phy_info(y_before_combine_des, y) + print_tensor_phy_info(z_before_combine_des, z) + + lst = [x, y, z] + + combine_tensor = combine_npu(lst) + print() + print_tensor_phy_info(combine_tensor_des, combine_tensor) + print_tensor_phy_info(x_after_combine_des, x) + print_tensor_phy_info(y_after_combine_des, y) + print_tensor_phy_info(z_after_combine_des, z) + + # test for tensors with very large sizes. 
+ self.assertEqual(True, combine_tensor.is_contiguous()) + self.assertEqual(combine_tensor.data_ptr(), x.data_ptr()) + self.assertEqual(x.data_ptr() + x.storage().size() * x.element_size(), y.data_ptr()) + self.assertEqual(y.data_ptr() + y.storage().size() * y.element_size(), z.data_ptr()) + self.assertEqual(combine_tensor.storage().size(), x.storage().size() + y.storage().size() + z.storage().size()) + + def test_computation(self): + print('----------------------test computation------------------------') + x = torch.zeros((2, 2, 2, 2), device=self.device) + y = torch.zeros((4, 4), device=self.device) + z = torch.zeros((3, 3, 3), device=self.device) + + print_tensor_phy_info(x_before_combine_des, x) + print_tensor_phy_info(y_before_combine_des, y) + print_tensor_phy_info(z_before_combine_des, z) + + lst = [x, y, z] + + combine_tensor = combine_npu(lst) + + print() + print_tensor_phy_info(combine_tensor_des, combine_tensor) + print_tensor_phy_info(x_after_combine_des, x) + print_tensor_phy_info(y_after_combine_des, y) + print_tensor_phy_info(z_after_combine_des, z) + + combine_tensor += 2 + + self.assertEqual(32, x.sum()) + self.assertEqual(32, y.sum()) + self.assertEqual(54, z.sum()) + + for tensor in lst: + tensor.mul_(2) + + self.assertEqual(236, combine_tensor.sum()) + self.assertEqual(combine_tensor.sum(), x.sum() + y.sum() + z.sum()) + + @unittest.skip("not stable test") + def test_storage_reuse_and_memory_release(self): + print('----------------------test storage reuse and memory release------------------------') + x = torch.zeros((2, 2, 2, 2), device=self.device) + y = torch.zeros((4, 4), device=self.device) + + print_tensor_phy_info(x_before_combine_des, x) + print_tensor_phy_info(y_before_combine_des, y) + + store_x = x.data_ptr() + store_y = y.data_ptr() + + lst = [x, y] + + combine_tensor = combine_npu(lst) + print() + print_tensor_phy_info(combine_tensor_des, combine_tensor) + print_tensor_phy_info(x_after_combine_des, x) + print_tensor_phy_info(y_after_combine_des, y) + + ### test for storage reuse ### + c = torch.zeros((2, 2, 2, 2), device=self.device) + d = torch.zeros((4, 4), device=self.device) + + print() + print('new tensor c: ', c.data_ptr(), c.size(), c.device) + print('new tensor d: ', d.data_ptr(), d.size(), d.device) + + self.assertEqual(store_x, c.data_ptr()) + self.assertEqual(store_y, d.data_ptr()) + + ### test for memory release ### + store_x_after_combine = x.data_ptr() + x = None + new_tensor = torch.zeros((2, 2, 2, 2), device=self.device) + + print('new_tensor with size of x: ', new_tensor.data_ptr(), new_tensor.size(), new_tensor.device) + self.assertNotEqual(store_x_after_combine, new_tensor.data_ptr()) + +if __name__ == '__main__': + unittest.main(argv=['test_combine_tensors.py']) diff --git a/src/apex/dump/__init__.py b/src/apex/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/apex/dump/dump.py b/src/apex/dump/dump.py new file mode 100644 index 0000000000000000000000000000000000000000..ee93d51b452bfabfa8be3cc6df4b4e3eccc7e7a2 --- /dev/null +++ b/src/apex/dump/dump.py @@ -0,0 +1,115 @@ +import torch +from torch import nn +import h5py +import numpy as np +import sys + +torch.set_printoptions(profile="full") + +class HDF5FileOperation(): + def __init__(self, file_path, is_cover=True): + self.file_path = file_path + if is_cover: + with h5py.File(self.file_path, "w") as f: + pass + + # read HDF5文件 + def read_data(self): + print("######### HDF5 Data storage structure 
#########") + with h5py.File(self.file_path, "r") as f: + def print_name(name): + print(name) + dset=f[name] + if isinstance(dset, h5py.Dataset): + print(dset.dtype) + print(dset.shape) + print() + f.visit(print_name) + + # write HDF5文件 + def write_data(self, groupName, datasetName, data_dump): + index = -1 + if type(data_dump) == tuple: + for index, dd in enumerate(data_dump): + self._write_dataset(groupName, datasetName, dd, index) + else: + self._write_dataset(groupName, datasetName, data_dump, index) + print("write end.") + + def _write_dataset(self, groupName, datasetName, dataTensor, index): + with h5py.File(self.file_path, "a") as f: + grp = f[groupName] + if dataTensor is not None: + data = dataTensor.cpu().data.numpy() + if index == -1: + print("writetoh5py create group:", groupName, "-", datasetName, "[shapes=", dataTensor.shape, "]") + grp.create_dataset(datasetName, data=data) + else: + print("writetoh5py create group tuple:", groupName, "-", datasetName+str(index), "[shapes=", dataTensor.shape, "]") + grp.create_dataset(datasetName+str(index), data=data) + else: + if index == -1: + print("writetoh5py create group:", groupName, "-", datasetName, "[shapes=None]") + grp.create_dataset(datasetName, shape=(0,), dtype='f', data=None) + else: + print("writetoh5py create group tuple:", groupName, "-", datasetName+str(index), "[shapes=None]") + grp.create_dataset(datasetName+str(index), shape=(0,), dtype='f', data=None) + +class HookerAndDumper(): + def __init__(self, file_path=""): + self.handles = [] + self.file_path = file_path + self.G_WRITE_POINT_COUNT = 0 + + def _create_group(self, module): + print("modules:", type(module)) # 用于区分模块 + num_id_str = "%04d" % self.G_WRITE_POINT_COUNT + self.G_WRITE_POINT_COUNT += 1 + group_name = num_id_str + "_" + module._get_name() + with h5py.File(self.file_path, "a") as f: + f.create_group(group_name) + return group_name + + # dump前向数据 + def forward_hook_fn(self, module, input, output): + print("######## forward_hook_fn ########") + group_name = self._create_group(module) + self.hfo.write_data(group_name, "feat_input", input) + self.hfo.write_data(group_name, "feat_output", output) + + # dump 后向数据 + def backward_hook_fn(self, module, grad_input, grad_output): + print("######## backward_hook_fn ########") + group_name = self._create_group(module) + self.hfo.write_data(group_name, "grad_output", grad_output) + self.hfo.write_data(group_name, "grad_input", grad_input) + + # epoch 和 step用来标志文件名称 + def register_hooks(self, model, epoch, step): + # write HDF5文件 + self.file_path = self.file_path + str(epoch) + "-epoch" + "_" + str(step) + "-step" +"_dump.hdf5" + self.hfo = HDF5FileOperation(self.file_path) + # 遍历所有 module,注册 forward hook 和 backward hook + print("model:", type(model)) + self.model = model + modules = list(self.model.named_children()) + print("register children model:") + for name, module in modules: + print("children_name:", name) + forward_handle = module.register_forward_hook(self.forward_hook_fn) + self.handles.append(forward_handle) + backward_handle = module.register_backward_hook(self.backward_hook_fn) + self.handles.append(backward_handle) + print("register hook ok.") + + # remove 所有的handle + def remove_hooks(self): + for handle in self.handles: + handle.remove() + print("######## write path:", self.file_path, " ########") + + # read HDF5文件 + def read_h5py_data(self): + self.hfor = HDF5FileOperation(file_path=self.file_path, is_cover=False) + self.hfor.read_data() + diff --git "a/src/apex/dump/dump\350\257\264\346\230\216.md" 
"b/src/apex/dump/dump\350\257\264\346\230\216.md" new file mode 100644 index 0000000000000000000000000000000000000000..2f69604e1ac474bd6cacbcaa3ad5945542848712 --- /dev/null +++ "b/src/apex/dump/dump\350\257\264\346\230\216.md" @@ -0,0 +1,239 @@ +# Pytorch module dump功能 + +## 一、实现原理 + +### 功能 + +能dump出规定module epoch和step的每层数据,Dump完成后,可以 remove hook的handle。 + +依赖Pytorch Hook机制,采用Pytorch Module的forward hook和backward hook分别dump出每个module的前向和反向输出。 + +### 存放位置 + +该相关文件放在[Ascend](https://gitee.com/ascend) 社区apex仓库下的apex/dump文件夹。 + +### 安装包依赖 + +HDF5支持安装 + +``` +sudo apt-get install libhdf5-dev # 安装头文件依赖 +sudo pip install h5py # 安装h5py +``` + +apex包安装 + +``` +pip install --upgrade apex-*.whl +``` + +### Forward hook + +正向数据输出每个在module层: + +```python +# 定义 forward hook function +def hook_fn_forward(module, input, output): + print(module) # 用于区分模块 + print('input', input) # 首先打印出来 + print('output', output) + +model = Model() +modules = model.named_children() # 获取子模块 +for name, module in modules: + module.register_forward_hook(hook_fn_forward) #子模块注册 +``` + +### Backward hook + +反向梯度数据输出每个层: + +```python +def hook_fn_backward(module, grad_input, grad_output): + print(module) # 为了区分模块 + print('grad_output', grad_output) # 为了符合反向传播的顺序,我们先打印 grad_output + print('grad_input', grad_input) # 再打印 grad_input + +model = Model() +modules = model.named_children() +for name, module in modules: + module.register_backward_hook(hook_fn_backward) +``` + +按照输出和输入, 顺序序列化每个tensor。 + +### 接口定义 + +```python +import torch +from torch import nn +import h5py + +class HookerAndDumper(): +def __init__(self, file_path = "", is_screen_display=False): + # 参数初始化 +def forward_hook_fn(self, module, input, output): + # dump前向数据 +def backward_hook_fn(self, module, grad_input, grad_output): + # dump 后向数据 + # epoch 和 step用来标志文件名称 +def register_hooks(self, epoch, step): + # 子模块注册 +def remove_hooks(self): + # remove 所有的handle +def read_h5py_data(self): + # 读取存储的hdf5文件的存储结构 group dataset + +``` + +## 二、功能调用说明 + +以resnet50为例进行说明 + +### 导入模块dump.py + +从apex模块中导入 + +``` +import apex.dump.dump as dump +或者 +from apex.dump.dump import HookerAndDumper +``` + +注:需要安装h5py包,pip install h5py + +### ForwardAndBackwardDumper类初始化 + +``` +fwb = dump.HookerAndDumper(file_path="···/": str) -> HookerAndDumper +``` + +参数说明: + +file_path:设置存储的文件路径,最后以”/”结束,默认存储在当前路径下。 + +### hook注册 + +``` +fwb.register_hooks(model=model: module, epoch=0: int, step=3: int) -> handle +``` + +参数说明: + +model:计算的model,会对每个子模块进行注册。 + +epoch:需要dump的epoch位置,需要将该注册函数放在相应epoch位置,该函数会将epoch信息添加到生成的文件名当中。 + +step:同上,需要dump的step位置,请将该注册函数放在相应step位置,该函数会将step信息添加到生成的文件名当中。 + +其间,会有子模块注册和Dump数据到hdf5文件的信息在屏幕打印。 + +### 关闭dump功能 + +``` +fwb.remove_hooks() +``` + +功能说明: + +关闭hook 的handle,请将该函数放在对应epoch,step结束之后。 + +显示关闭hook和文件成功信息,反馈dump过程中写入的文件路径与文件名信息。 + +### 读取hdf5文件 + +``` +fwb.read_h5py_data() +``` + +功能说明: + +读取文件,在屏幕端显示hdf5文件格式的group和group下dataset信息,可根据显示的信息对文件内容进行访问,详细访问方式在下章节介绍。 + +## 三、HDF5格式文件读写方法 + +### 介绍 + +HDF5是一种存储相同类型数值的大数组的机制,适用于可被层次性组织且数据集需要被元数据标记的数据模型,常用的接口模块为 h5py,便于进行数据的比对。 + +**- hdf5 files**: 能够存储两类数据对象 dataset 和 group 的容器,其操作类似 **python 标准的文件**操作;File 实例对象本身就是一个组,以 `/` 为名,是遍历文件的入口。 + +**- dataset(array-like)**: 可类比为 Numpy 数组,每个数据集都有一个名字(name)、形状(shape) 和类型(dtype),支持切片操作。 + +**- group(folder-like)**: 可以类比为 字典,它是一种像文件夹一样的容器;group 中可以存放 dataset 或者其他的 group,键就是组成员的名称,值就是组成员对象本身(组或者数据集)。 + +### 安装 + +``` +pip install h5py +``` + +### read + +```python +>>> import h5py +>>> f = h5py.File('mytestfile.hdf5', 'r') +>>> dset = f['mydataset'] # 通过键值获得dataset或group +>>> dset.shape 
+>>> dset.dtype +``` + +```python +>>> import torch +>>> import h5py +>>> torch.set_printoptions(profile="full") +>>> f = h5py.File("0-epoch_3-step_dump.hdf5") +>>> data = f['018_BatchNorm2d/grad_input0'] +>>> print(data) +>>> print(data.value) +>>> print(data[:]) +``` + +### Write + +```python +>>> import h5py +>>> import numpy as np +>>> f = h5py.File("mytestfile.hdf5", "w") +>>> dset = f.create_dataset("mydataset", (100,), dtype='i') # 直接创建dataset + +>>> arr = np.arange(100) +>>> dset = f.create_dataset("init", data=arr) # 直接传入data,无需设置dtype和shape类型,会根据arr类型自动设置 + +>>> f2 = h5py.File('mydataset2.hdf5', 'a') +>>> grp = f2.create_group("subgroup") +>>> dset2 = grp.create_dataset("another_dataset", (50,), dtype='f') # 在group上创建dataset +``` + +### 遍历文件 + +**f.Key()** + +```python +import h5py +with h5py.File('cat_dog.h5', "r") as f: + for key in f.keys(): + # 若是group对象,则没有value属性的,会包异常。 + print(f[key], key, f[key].name, f[key].value) + # f[key] means a dataset or a group object. + # f[key].value visits dataset' value, except group object. + print(f[key], key, f[key].name) + +``` + +**f.visit()** + +```python +with h5py.File(self.file_path, "r") as f: + def print_name(name): + print(name) + dset=f[name] + if isinstance(dset, h5py.Dataset): # 判断为Dataset成员类型,不是Group + # 可对获取的dataset数据进行相应操作 + print(dset.dtype) # dset数据类型 + print(dset.shape) # dset shape格式 + print(dset[:]) # 显示dset数据信息,也可用dset.value + f.visit(print_name) # 遍历hfd5格式信息传导到输入的函数中作为参数name + +``` + diff --git a/src/apex/optimizers/lamb.py b/src/apex/optimizers/lamb.py new file mode 100644 index 0000000000000000000000000000000000000000..01a98fefb9a0b374218656feaaeae27c25ec70e3 --- /dev/null +++ b/src/apex/optimizers/lamb.py @@ -0,0 +1,128 @@ +# This is based on pytorch-lamb (https://github.com/cybertronai/pytorch-lamb). +# +# Copyright (c) 2021, Huawei Technologies. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Lamb optimizer.""" + +import collections +import math + +import torch +from torch.optim import Optimizer + +class Lamb(Optimizer): + r"""Implements Lamb algorithm. + + It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + adam (bool, optional): always use trust ratio = 1, which turns this into + Adam. Useful for comparison purposes. + + .. 
_Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0, adam=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay) + self.adam = adam + super(Lamb, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # m_t + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + # v_t + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + # Paper v3 does not use debiasing. + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + # Apply bias to lr to avoid broadcast. + step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 + + weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) + + adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) + if group['weight_decay'] != 0: + adam_step.add_(p.data, alpha=group['weight_decay']) + + adam_norm = adam_step.pow(2).sum().sqrt() + if weight_norm == 0 or adam_norm == 0: + trust_ratio = 1 + else: + trust_ratio = weight_norm / adam_norm + state['weight_norm'] = weight_norm + state['adam_norm'] = adam_norm + state['trust_ratio'] = trust_ratio + if self.adam: + trust_ratio = 1 + + alpha = -step_size * trust_ratio + adam_step.mul_(alpha) + p.data.add_(adam_step) + + return loss diff --git a/src/apex/optimizers/npu_fused_adadelta.py b/src/apex/optimizers/npu_fused_adadelta.py new file mode 100644 index 0000000000000000000000000000000000000000..afb3d4330256a122d0c69184a86010475e142132 --- /dev/null +++ b/src/apex/optimizers/npu_fused_adadelta.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch.optim.optimizer import Optimizer
+from collections import defaultdict
+from ..contrib.combine_tensors import combine_npu
+
+class NpuFusedAdadelta(Optimizer):
+    """Implements NpuFusedAdadelta algorithm.
+    Currently NPU-only. Requires Apex to be installed via
+    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--npu_float_status" ./``.
+
+    This version of fused ADADELTA implements one fusion.
+
+    * A combine-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+
+    :class:`apex.optimizers.NpuFusedAdadelta` may be used as a drop-in replacement for ``torch.optim.Adadelta``::
+
+        opt = apex.optimizers.NpuFusedAdadelta(model.parameters(), lr = ....)
+        ...
+        opt.step()
+
+    :class:`apex.optimizers.NpuFusedAdadelta` should be used with Amp. Currently, if you wish to use :class:`NpuFusedAdadelta` with Amp,
+    only ``opt_level O2`` can be chosen::
+
+        opt = apex.optimizers.NpuFusedAdadelta(model.parameters(), lr = ....)
+        model, opt = amp.initialize(model, opt, opt_level="O2")
+        ...
+        opt.step()
+    It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        rho (float, optional): coefficient used for computing a running average
+            of squared gradients (default: 0.9)
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-6)
+        lr (float, optional): coefficient that scales delta before it is applied
+            to the parameters (default: 1.0)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+
+    __ https://arxiv.org/abs/1212.5701
+    """
+
+    def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= rho <= 1.0:
+            raise ValueError("Invalid rho value: {}".format(rho))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+
+        defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
+        self.is_npu_fused_optimizer = True
+        super(NpuFusedAdadelta, self).__init__(params, defaults)
+
+    def _init_param_state(self, p):
+        state = self.state[p]
+        # State initialization
+        if len(state) == 0:
+            state['step'] = 0
+            state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+            state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+        else:
+            square_avg_tmp = torch.zeros_like(p, memory_format=torch.preserve_format)
+            square_avg_tmp.copy_(state['square_avg'])
+            state['square_avg'] = square_avg_tmp
+
+            acc_delta_tmp = torch.zeros_like(p, memory_format=torch.preserve_format)
+            acc_delta_tmp.copy_(state['acc_delta'])
+            state['acc_delta'] = acc_delta_tmp
+
+    def _combine_group_param_states(self, group_index):
+        group = self.param_groups[group_index]
+        stash = self._amp_stash
+        group_params_list = stash.params_lists_indexed_by_group[group_index]
+
+        combined_param_states = []
+        for params in group_params_list:
+            step_list = []
+            square_avg_list = []
+            acc_delta_list = []
+
+            for p in params:
+                if p.grad is None:
+                    continue
+                grad = p.grad
+                if grad.is_sparse:
+                    raise RuntimeError('NpuFusedAdadelta does not support sparse gradients')
+
+
self._init_param_state(p) + state = self.state[p] + step_list.append(state['step']) + square_avg_list.append(state['square_avg']) + acc_delta_list.append(state['acc_delta']) + + combined_step = 0 + combined_square_avg = None + combined_acc_delta = None + + if len(square_avg_list) > 0: + combined_step = step_list[0] + combined_square_avg = combine_npu(square_avg_list) + combined_acc_delta = combine_npu(acc_delta_list) + + combined_state = defaultdict(dict) + combined_state['step'] = combined_step + combined_state['square_avg'] = combined_square_avg + combined_state['acc_delta'] = combined_acc_delta + combined_param_states.append(combined_state) + stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for group in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, group in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _group_step(self, group_index): + group = self.param_groups[group_index] + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdadelta does not support sparse gradients') + state_p = self.state[p] + state_p['step'] += 1 + + rho, eps = group['rho'], group['eps'] + + stash = self._amp_stash + combined_group_params = stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state in zip(combined_group_params, + combined_group_grads, + combined_group_param_states): + if combined_param is None or combined_grad is None: + continue + + square_avg, acc_delta = combined_param_state['square_avg'], combined_param_state['acc_delta'] + combined_param_state['step'] += 1 + + if group['weight_decay'] != 0: + combined_grad = combined_grad.add(combined_param, alpha=group['weight_decay']) + + square_avg.mul_(rho).addcmul_(combined_grad, combined_grad, value=1 - rho) + std = square_avg.add(eps).sqrt_() + delta = acc_delta.add(eps).sqrt_().div_(std).mul_(combined_grad) + combined_param.add_(delta, alpha=-group['lr']) + acc_delta.mul_(rho).addcmul_(delta, delta, value=1 - rho) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+        """
+        if not hasattr(self, "_amp_stash"):
+            raise RuntimeError('apex.optimizers.NpuFusedAdadelta should be used with AMP.')
+
+        self._check_already_combined_params_and_grads()
+        # combine params and grads first
+        self._combine_params_and_grads_by_group()
+        # then combine param states
+        self._combine_param_states_by_group()
+
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        stash = self._amp_stash
+        for i, group in enumerate(self.param_groups):
+            self._group_step(i)
+
+        return loss
diff --git a/src/apex/optimizers/npu_fused_adam.py b/src/apex/optimizers/npu_fused_adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1366102eb8666ce400cb79043e721b3338a9c1
--- /dev/null
+++ b/src/apex/optimizers/npu_fused_adam.py
@@ -0,0 +1,259 @@
+# Copyright (c) 2020, Huawei Technologies.
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+from collections import defaultdict
+from ..contrib.combine_tensors import combine_npu
+
+class NpuFusedAdam(Optimizer):
+
+    """Implements Adam algorithm.
+
+    Currently NPU-only. Requires Apex to be installed via
+    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--npu_float_status" ./``.
+
+    This version of NPU fused Adam implements one fusion.
+
+    * A combine-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+
+    :class:`apex.optimizers.NpuFusedAdam` may be used as a drop-in replacement for ``torch.optim.Adam``::
+
+        opt = apex.optimizers.NpuFusedAdam(model.parameters(), lr = ....)
+        ...
+        opt.step()
+
+    :class:`apex.optimizers.NpuFusedAdam` should be used with Amp. Currently, if you wish to use :class:`NpuFusedAdam` with Amp,
+    only ``opt_level O2`` can be chosen::
+
+        opt = apex.optimizers.NpuFusedAdam(model.parameters(), lr = ....)
+        model, opt = amp.initialize(model, opt, opt_level="O2")
+        ...
+        opt.step()
+
+
+    Adam was proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square. (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            (default: False)
+
+    .. _Adam - A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    ..
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + self.is_npu_fused_optimizer = True + super(NpuFusedAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(NpuFusedAdam, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def _init_param_state(self, p, amsgrad): + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + else: + exp_avg_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_tmp.copy_(state['exp_avg']) + state['exp_avg'] = exp_avg_tmp + + exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_sq_tmp.copy_(state['exp_avg_sq']) + state['exp_avg_sq'] = exp_avg_sq_tmp + + if amsgrad: + max_exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + max_exp_avg_sq_tmp.copy_(state['max_exp_avg_sq']) + state['max_exp_avg_sq'] = max_exp_avg_sq_tmp + + def _combine_group_param_states(self, group_index): + group = self.param_groups[group_index] + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + amsgrad = group['amsgrad'] + + combined_param_states = [] + for params in group_params_list: + step_list = [] + exp_avg_list = [] + exp_avg_sq_list = [] + max_exp_avg_sq_list = [] + + for p in params: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdam does not support sparse gradients, ' + 'please consider SparseAdam instead') + + self._init_param_state(p, amsgrad) + state = self.state[p] + step_list.append(state['step']) + exp_avg_list.append(state['exp_avg']) + exp_avg_sq_list.append(state['exp_avg_sq']) + if amsgrad: + max_exp_avg_sq_list.append(state['max_exp_avg_sq']) + + combined_step = 0 + combined_exp_avg = None + combined_exp_avg_sq = None + combined_max_exp_avg_sq = None + + if len(exp_avg_list) > 0: + combined_step = step_list[0] + combined_exp_avg = combine_npu(exp_avg_list) + combined_exp_avg_sq = combine_npu(exp_avg_sq_list) + combined_max_exp_avg_sq = combine_npu(max_exp_avg_sq_list) + + combined_state = defaultdict(dict) + combined_state['step'] = combined_step + combined_state['exp_avg'] = combined_exp_avg + combined_state['exp_avg_sq'] = combined_exp_avg_sq + combined_state['max_exp_avg_sq'] = combined_max_exp_avg_sq + combined_param_states.append(combined_state) + 
stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for group in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, group in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _group_step(self, group_index): + group = self.param_groups[group_index] + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdam does not support sparse gradients, ' + 'please consider SparseAdam instead') + state_p = self.state[p] + state_p['step'] += 1 + + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + + stash = self._amp_stash + combined_group_params = stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state in zip(combined_group_params, + combined_group_grads, + combined_group_param_states): + if combined_param is None or combined_grad is None: + continue + + exp_avg, exp_avg_sq = combined_param_state['exp_avg'], combined_param_state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = combined_param_state['max_exp_avg_sq'] + + combined_param_state['step'] += 1 + bias_correction1 = 1 - beta1 ** combined_param_state['step'] + bias_correction2 = 1 - beta2 ** combined_param_state['step'] + + if group['weight_decay'] != 0: + combined_grad = combined_grad.add(combined_param, alpha=group['weight_decay']) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(combined_grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(combined_grad, combined_grad, value=1 - beta2) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + step_size = group['lr'] / bias_correction1 + + combined_param.addcdiv_(exp_avg, denom, value=-step_size) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + + if not hasattr(self, "_amp_stash"): + raise RuntimeError('apex.optimizers.NpuFusedAdam should be used with AMP.') + + self._check_already_combined_params_and_grads() + # combine params and grads first + self._combine_params_and_grads_by_group() + # then combine param states + self._combine_param_states_by_group() + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + stash = self._amp_stash + for i, group in enumerate(self.param_groups): + self._group_step(i) + + return loss diff --git a/src/apex/optimizers/npu_fused_bert_adam.py b/src/apex/optimizers/npu_fused_bert_adam.py new file mode 100644 index 0000000000000000000000000000000000000000..38a629c42ad310b2fd5a9fd40e3cc3f6131a3dcf --- /dev/null +++ b/src/apex/optimizers/npu_fused_bert_adam.py @@ -0,0 +1,225 @@ +# coding=utf-8 +# Copyright (c) 2021 Huawei Technologies. All rights reserved. +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PyTorch optimization for BERT model.""" + +import math +from collections import defaultdict +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from ..contrib.combine_tensors import combine_npu + +def warmup_cosine(x, warmup=0.002): + if x < warmup: + return x/warmup + return 0.5 * (1.0 + torch.cos(math.pi * x)) + +def warmup_constant(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return max((x - 1. )/ (warmup - 1.), 0.) + +def warmup_poly(x, warmup=0.002, degree=0.5): + if x < warmup: + return x/warmup + return (1.0 - x)**degree + + +SCHEDULES = { + 'warmup_cosine':warmup_cosine, + 'warmup_constant':warmup_constant, + 'warmup_linear':warmup_linear, + 'warmup_poly':warmup_poly, +} + +class NpuFusedBertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. This is the fused version on NPU + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate. Default: -1 + schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 + """ + + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.99, e=1e-6, weight_decay=0.01, + max_grad_norm=-1): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + self.is_npu_fused_optimizer = True + self.max_grad_norm = max_grad_norm + super(NpuFusedBertAdam, self).__init__(params, defaults) + + def _init_param_state(self, p): + state = self.state[p] + # state initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + else: + exp_avg_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_tmp.copy_(state['exp_avg']) + state['exp_avg'] = exp_avg_tmp + + exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_sq_tmp.copy_(state['exp_avg_sq']) + state['exp_avg_sq'] = exp_avg_sq_tmp + + def _combine_group_param_states(self, group_index): + group = self.param_groups[group_index] + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + combined_param_states = [] + for params in group_params_list: + step_list = [] + exp_avg_list = [] + exp_avg_sq_list = [] + + for p in params: + if p.grad is None: + continue + grad = p.grad + + self._init_param_state(p) + state = self.state[p] + step_list.append(state['step']) + exp_avg_list.append(state['exp_avg']) + exp_avg_sq_list.append(state['exp_avg_sq']) + + combined_step = 0 + combined_exp_avg = None + combined_exp_avg_sq = None + + if len(exp_avg_list) > 0: + combined_step = step_list[0] + combined_exp_avg = combine_npu(exp_avg_list) + combined_exp_avg_sq = combine_npu(exp_avg_sq_list) + + combined_state = defaultdict(dict) + combined_state['step'] = combined_step + combined_state['exp_avg'] = combined_exp_avg + combined_state['exp_avg_sq'] = combined_exp_avg_sq + + combined_param_states.append(combined_state) + stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for group in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, group in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _group_step(self, group_index): + group = self.param_groups[group_index] + + beta1, beta2 = group['b1'], group['b2'] + + stash = self._amp_stash + combined_group_params = 
stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state in zip(combined_group_params, + combined_group_grads, + combined_group_param_states): + if combined_param is None or combined_grad is None: + continue + + exp_avg, exp_avg_sq = combined_param_state['exp_avg'], combined_param_state['exp_avg_sq'] + + if group['max_grad_norm'] > 0 and self.global_grad_norm != float('inf') and self.global_grad_norm > 1: + combined_grad /= self.global_grad_norm + + exp_avg.mul_(beta1).add_(1 - beta1, combined_grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, combined_grad, combined_grad) + update = exp_avg / (exp_avg_sq.sqrt() + group['e']) + + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * combined_param.data + + if group['t_total'] != -1: + schedule_fct = SCHEDULES[group['schedule']] + lr_scheduled = group['lr'] * schedule_fct(combined_param_state['step'] / group['t_total'], group['warmup']) + else: + lr_scheduled = group['lr'] + + update_with_lr = lr_scheduled * update + combined_param.data.add_(-update_with_lr) + combined_param_state['step'] += 1 + + def get_global_grad_norm(self): + self.global_grad_norm = 0 + for i, group in enumerate(self.param_groups): + for combined_group_grads in self._amp_stash.combined_grads_indexed_by_group[i]: + if combined_group_grads is not None: + self.global_grad_norm += combined_group_grads.pow(2).sum() + self.global_grad_norm = self.global_grad_norm.sqrt().item() + + @torch.no_grad() + def step(self, closure=None): + self._check_already_combined_params_and_grads() + self._combine_params_and_grads_by_group() + self._combine_param_states_by_group() + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + if self.max_grad_norm > 0: + self.get_global_grad_norm() + + stash = self._amp_stash + for i, group in enumerate(self.param_groups): + self._group_step(i) + + return loss \ No newline at end of file diff --git a/src/apex/optimizers/npu_fused_lamb.py b/src/apex/optimizers/npu_fused_lamb.py new file mode 100644 index 0000000000000000000000000000000000000000..aeb576846b68dcf0da9e86fc4486e3f2f55b05cd --- /dev/null +++ b/src/apex/optimizers/npu_fused_lamb.py @@ -0,0 +1,309 @@ +# This is based on pytorch-lamb (https://github.com/cybertronai/pytorch-lamb). +# +# Copyright (c) 2021, Huawei Technologies. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""NpuFusedLamb optimizer.""" + +import math +import torch +from torch.optim.optimizer import Optimizer +from collections import defaultdict +from ..contrib.combine_tensors import combine_npu + +class NpuFusedLamb(Optimizer): + r"""Implements NpuFusedLamb algorithm. + + It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. 
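(Illustrative note, not part of the patch.) In `NpuFusedBertAdam._group_step` above, the scheduled learning rate reduces to `lr * schedule_fct(step / t_total, warmup)` whenever `t_total != -1`, and otherwise falls back to the constant `group['lr']`. A minimal standalone sketch, re-stating `warmup_linear` in plain Python with hypothetical values for the base rate, total steps and warmup fraction:

```
def warmup_linear(x, warmup=0.002):
    # linear ramp-up to the base lr, then linear decay towards zero at x == 1.0
    if x < warmup:
        return x / warmup
    return max((x - 1.) / (warmup - 1.), 0.)

base_lr = 5e-5     # hypothetical base learning rate
t_total = 10000    # hypothetical total number of training steps
warmup = 0.1       # warm up over the first 10% of steps

for step in (0, 500, 1000, 5000, 10000):
    lr_scheduled = base_lr * warmup_linear(step / t_total, warmup)
    print(f"step {step:>5}: lr = {lr_scheduled:.2e}")
```

The rate reaches the full `base_lr` at `step == warmup * t_total` and decays linearly to zero at `t_total`.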
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + adam (bool, optional): always use trust ratio = 1, which turns this into + Adam. Useful for comparison purposes. + use_global_grad_norm(bool, optional): use global grad norm (default: False) + + .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0, adam=False, use_global_grad_norm=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay) + self.adam = adam + self.use_global_grad_norm = use_global_grad_norm + self.is_npu_fused_optimizer = True + self.global_grad_norm = torch.Tensor([1]).to('npu') + self.middle_vars_are_combined_by_group = False + super(NpuFusedLamb, self).__init__(params, defaults) + + def _init_param_state(self, p): + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + else: + exp_avg_tmp = torch.zeros_like(p) + exp_avg_tmp.copy_(state['exp_avg']) + state['exp_avg'] = exp_avg_tmp + + exp_avg_sq_tmp = torch.zeros_like(p) + exp_avg_sq_tmp.copy_(state['exp_avg_sq']) + state['exp_avg_sq'] = exp_avg_sq_tmp + + def _combine_middle_vars(self, group_index): + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + stash.trust_ratio_lists_indexed_by_group[group_index] = [] + stash.param_pow_lists_indexed_by_group[group_index] = [] + stash.adam_step_pow_lists_indexed_by_group[group_index] = [] + + stash.combined_trust_ratios_indexed_by_group[group_index] = [] + stash.combined_param_pows_indexed_by_group[group_index] = [] + stash.combined_adam_step_pows_indexed_by_group[group_index] = [] + + for params in group_params_list: + trust_ratio_list = [] + param_pow_list = [] + adam_step_pow_list = [] + + for p in params: + trust_ratio_list.append(torch.zeros_like(p)) + param_pow_list.append(torch.zeros_like(p)) + adam_step_pow_list.append(torch.zeros_like(p)) + + combined_trust_ratio = combine_npu(trust_ratio_list) + combined_param_pow = combine_npu(param_pow_list) + combined_adam_step_pow = combine_npu(adam_step_pow_list) + + stash.trust_ratio_lists_indexed_by_group[group_index].append(trust_ratio_list) + stash.param_pow_lists_indexed_by_group[group_index].append(param_pow_list) + stash.adam_step_pow_lists_indexed_by_group[group_index].append(adam_step_pow_list) + + stash.combined_trust_ratios_indexed_by_group[group_index].append(combined_trust_ratio) + 
stash.combined_param_pows_indexed_by_group[group_index].append(combined_param_pow) + stash.combined_adam_step_pows_indexed_by_group[group_index].append(combined_adam_step_pow) + + def _combine_middle_vars_by_group(self): + stash = self._amp_stash + if self.middle_vars_are_combined_by_group: + return + + stash.trust_ratio_lists_indexed_by_group = [] + stash.param_pow_lists_indexed_by_group = [] + stash.adam_step_pow_lists_indexed_by_group = [] + + stash.combined_trust_ratios_indexed_by_group = [] + stash.combined_param_pows_indexed_by_group = [] + stash.combined_adam_step_pows_indexed_by_group = [] + + for group in self.param_groups: + stash.trust_ratio_lists_indexed_by_group.append([]) + stash.param_pow_lists_indexed_by_group.append([]) + stash.adam_step_pow_lists_indexed_by_group.append([]) + + stash.combined_trust_ratios_indexed_by_group.append([]) + stash.combined_param_pows_indexed_by_group.append([]) + stash.combined_adam_step_pows_indexed_by_group.append([]) + + for i, group in enumerate(self.param_groups): + self._combine_middle_vars(i) + self.middle_vars_are_combined_by_group = True + + def _combine_group_param_states(self, group_index): + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + combined_param_states = [] + for params in group_params_list: + step_list = [] + exp_avg_list = [] + exp_avg_sq_list = [] + + for p in params: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedLamb does not support sparse gradients, ' + 'please consider SparseAdam instead.') + + self._init_param_state(p) + state = self.state[p] + step_list.append(state['step']) + exp_avg_list.append(state['exp_avg']) + exp_avg_sq_list.append(state['exp_avg_sq']) + + combined_step = 0 + combined_exp_avg = None + combined_exp_avg_sq = None + + if len(exp_avg_list) > 0: + combined_step = step_list[0] + combined_exp_avg = combine_npu(exp_avg_list) + combined_exp_avg_sq = combine_npu(exp_avg_sq_list) + + combined_state = defaultdict(dict) + combined_state['step'] = combined_step + combined_state['exp_avg'] = combined_exp_avg + combined_state['exp_avg_sq'] = combined_exp_avg_sq + combined_param_states.append(combined_state) + stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for group in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, group in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _get_global_grad_norm(self): + global_norm = 0 + combined_grads = self.get_optimizer_combined_grads() + combined_grad_masks = self.get_optimizer_combined_grad_masks() + for combined_grad, combined_grad_mask in zip(combined_grads, combined_grad_masks): + if combined_grad is not None: + global_norm += combined_grad.pow(2).mul_(combined_grad_mask).sum() + global_norm.sqrt_() + return global_norm + + def _group_step(self, group_index): + group = self.param_groups[group_index] + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedLamb does not support sparse gradients, ' + 'please consider SparseAdam instead.') + state_p = self.state[p] + state_p['step'] += 1 + beta1, beta2 = group['betas'] + + stash = self._amp_stash + combined_group_params = 
stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + trust_ratio_lists = stash.trust_ratio_lists_indexed_by_group[group_index] + param_pow_lists = stash.param_pow_lists_indexed_by_group[group_index] + adam_step_pow_lists = stash.adam_step_pow_lists_indexed_by_group[group_index] + combined_trust_ratios = stash.combined_trust_ratios_indexed_by_group[group_index] + combined_param_pows = stash.combined_param_pows_indexed_by_group[group_index] + combined_adam_step_pows = stash.combined_adam_step_pows_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state, \ + trust_ratio_list, param_pow_list, adam_step_pow_list, \ + combined_trust_ratio, combined_param_pow, \ + combined_adam_step_pow in zip(combined_group_params, + combined_group_grads, + combined_group_param_states, + trust_ratio_lists, + param_pow_lists, + adam_step_pow_lists, + combined_trust_ratios, + combined_param_pows, + combined_adam_step_pows): + if combined_param is None or combined_grad is None: + continue + + if self.global_grad_norm.item() > 1: + combined_grad = combined_grad / self.global_grad_norm + + exp_avg, exp_avg_sq = combined_param_state['exp_avg'], combined_param_state['exp_avg_sq'] + combined_param_state['step'] += 1 + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(combined_grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(combined_grad, combined_grad, value=1 - beta2) + + step_size = group['lr'] + + adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) + if group['weight_decay'] != 0: + adam_step.add_(combined_param, alpha=group['weight_decay']) + + if self.adam: + combined_trust_ratio.fill_(1) + else: + combined_param_pow.copy_(combined_param.pow(2)) + combined_adam_step_pow.copy_(adam_step.pow(2)) + + for param_pow, adam_step_pow, trust_ratio in zip(param_pow_list, + adam_step_pow_list, + trust_ratio_list): + weight_norm = param_pow.sum().sqrt().clamp(0, 10) + adam_norm = adam_step_pow.sum().sqrt() + if weight_norm == 0 or adam_norm == 0: + trust_ratio.fill_(1) + else: + trust_ratio.fill_(weight_norm / adam_norm) + + combined_param.addcmul_(adam_step, combined_trust_ratio, value=-step_size) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + + if not hasattr(self, "_amp_stash"): + raise RuntimeError('apex.optimizers.NpuFusedLamb should be used with AMP.') + + self._check_already_combined_params_and_grads() + # combine params and grads first + self._combine_params_and_grads_by_group() + # then combine param states + self._combine_param_states_by_group() + self._combine_middle_vars_by_group() + + loss = None + if closure is not None: + loss = closure() + + stash = self._amp_stash + if self.use_global_grad_norm: + self.global_grad_norm = self._get_global_grad_norm() + for i, group in enumerate(self.param_groups): + self._group_step(i) + + return loss diff --git a/src/apex/optimizers/npu_fused_sgd.py b/src/apex/optimizers/npu_fused_sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d03fbac62aaf7dcd16b78e7610fa4f80130f9f --- /dev/null +++ b/src/apex/optimizers/npu_fused_sgd.py @@ -0,0 +1,234 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.optim.optimizer import Optimizer, required +from collections import defaultdict +from ..contrib.combine_tensors import combine_npu + +class NpuFusedSGD(Optimizer): + r"""Implements stochastic gradient descent (optionally with momentum). + + Currently NPU-only. Requires Apex to be installed via + ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--npu_float_status" ./``. + + This version of fused SGD implements 1 fusions. + + * A combine-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches. + + :class:`apex.optimizers.NpuFusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``:: + + opt = apex.optimizers.NpuFusedSGD(model.parameters(), lr = ....) + ... + opt.step() + + :class:`apex.optimizers.FusedSGD` should be used with Amp. Currently, if you wish to use :class:`NpuFusedSGD` with Amp, + only ``opt_level O2`` can be choosed:: + + opt = apex.optimizers.NpuFusedSGD(model.parameters(), lr = ....) + model, opt = amp.initialize(model, opt, opt_level="O2") + ... + opt.step() + + Nesterov momentum is based on the formula from + `On the importance of initialization and momentum in deep learning`__. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float): learning rate + momentum (float, optional): momentum factor (default: 0) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + dampening (float, optional): dampening for momentum (default: 0) + nesterov (bool, optional): enables Nesterov momentum (default: False) + + Example: + >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + >>> optimizer.zero_grad() + >>> loss_fn(model(input), target).backward() + >>> optimizer.step() + + __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf + + .. note:: + The implementation of SGD with Momentum/Nesterov subtly differs from + Sutskever et. al. and implementations in some other frameworks. + + Considering the specific case of Momentum, the update can be written as + + .. math:: + \begin{aligned} + v_{t+1} & = \mu * v_{t} + g_{t+1}, \\ + p_{t+1} & = p_{t} - \text{lr} * v_{t+1}, + \end{aligned} + + where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the + parameters, gradient, velocity, and momentum respectively. + + This is in contrast to Sutskever et. al. and + other frameworks which employ an update of the form + + .. math:: + \begin{aligned} + v_{t+1} & = \mu * v_{t} + \text{lr} * g_{t+1}, \\ + p_{t+1} & = p_{t} - v_{t+1}. + \end{aligned} + + The Nesterov version is analogously modified. 
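(Illustrative note, not part of the patch.) The two momentum formulations contrasted above can be restated as a tiny numeric sketch with hypothetical scalar values; it only illustrates the formulas and is not code from the optimizer:

```
mu, lr, g = 0.9, 0.1, 0.5   # hypothetical momentum, learning rate, gradient
v, p = 1.0, 2.0             # hypothetical velocity and parameter

# PyTorch / NpuFusedSGD form: lr scales the whole velocity.
v_new = mu * v + g           # 1.4
p_new = p - lr * v_new       # 1.86

# Sutskever et al. form: lr is folded into the velocity update.
v_alt = mu * v + lr * g      # 0.95
p_alt = p - v_alt            # 1.05

print(p_new, p_alt)
```

Starting both forms from the same velocity is purely for illustration; in practice the two buffers would have accumulated differently scaled gradients.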
+ """ + + def __init__(self, params, lr=required, momentum=0, dampening=0, + weight_decay=0, nesterov=False): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if momentum < 0.0: + raise ValueError("Invalid momentum value: {}".format(momentum)) + if weight_decay < 0.0: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + + defaults = dict(lr=lr, momentum=momentum, dampening=dampening, + weight_decay=weight_decay, nesterov=nesterov) + if nesterov and (momentum <= 0 or dampening != 0): + raise ValueError("Nesterov momentum requires a momentum and zero dampening") + self.is_npu_fused_optimizer = True + self._momentum_buffer_already_in_state = False + super(NpuFusedSGD, self).__init__(params, defaults) + + def __setstate__(self, state): + super(NpuFusedSGD, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('nesterov', False) + + def _init_param_state(self, p, weight_decay): + d_p = p.grad + state = self.state[p] + if 'momentum_buffer' not in state: + self._momentum_buffer_already_in_state = False + if weight_decay != 0: + d_p = d_p.add(p, alpha=weight_decay) + state['momentum_buffer'] = torch.clone(d_p).detach() + else: + temp = torch.clone(d_p).detach() + temp.copy_(state['momentum_buffer']) + state['momentum_buffer'] = temp + + def _combine_group_param_states(self, group_index): + group = self.param_groups[group_index] + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + weight_decay = group['weight_decay'] + momentum = group['momentum'] + + combined_param_states = [] + for params in group_params_list: + if momentum == 0: + combined_state = defaultdict(dict) + combined_state['momentum_buffer'] = None + combined_param_states.append(combined_state) + continue + + momentum_buffer_list = [] + for p in params: + if p.grad is None: + continue + + self._init_param_state(p, weight_decay) + state = self.state[p] + momentum_buffer_list.append(state['momentum_buffer']) + + combined_momentum_buffer = None + if len(momentum_buffer_list) > 0: + combined_momentum_buffer = combine_npu(momentum_buffer_list) + + combined_state = defaultdict(dict) + combined_state['momentum_buffer'] = combined_momentum_buffer + combined_param_states.append(combined_state) + stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for group in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, group in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _group_step(self, group_index): + group = self.param_groups[group_index] + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + stash = self._amp_stash + combined_group_params = stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state in zip(combined_group_params, + combined_group_grads, + combined_group_param_states): + if combined_param is None or combined_grad is None: + continue + + if weight_decay != 0: + 
combined_grad = combined_grad.add(combined_param, alpha=weight_decay) + if momentum != 0: + buf = combined_param_state['momentum_buffer'] + if self._momentum_buffer_already_in_state: + buf.mul_(momentum).add_(combined_grad, alpha=1 - dampening) + + if nesterov: + combined_grad = combined_grad.add(buf, alpha=momentum) + else: + combined_grad = buf + + combined_param.add_(combined_grad, alpha=-group['lr']) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + if not hasattr(self, "_amp_stash"): + raise RuntimeError('apex.optimizers.NpuFusedSGD should be used with AMP.') + + self._momentum_buffer_already_in_state = True + self._check_already_combined_params_and_grads() + # combine params and grads first + self._combine_params_and_grads_by_group() + # then combine param states + self._combine_param_states_by_group() + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + stash = self._amp_stash + for i, group in enumerate(self.param_groups): + self._group_step(i) + + return loss diff --git a/src/csrc/combine_tensors/change_dataptr.cpp b/src/csrc/combine_tensors/change_dataptr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..198b201345ee87edf1e95ca0fcfdefaa01629aab --- /dev/null +++ b/src/csrc/combine_tensors/change_dataptr.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + + +void change_data_ptr(at::Tensor des, at::Tensor src, int offset) +{ + if (src.scalar_type() == at::ScalarType::Half) { + at::Half* data_ptr = static_cast(src.storage().data_ptr().get()) + offset; + at::DataPtr aim_data_ptr = at::DataPtr(data_ptr, des.storage().device()); + des.storage().set_data_ptr(std::move(aim_data_ptr)); + } else { + float* data_ptr = static_cast(src.storage().data_ptr().get()) + offset; + at::DataPtr aim_data_ptr = at::DataPtr(data_ptr, des.storage().device()); + des.storage().set_data_ptr(std::move(aim_data_ptr)); + } +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("change_data_ptr", &change_data_ptr, "Change data ptr"); +} diff --git a/src/csrc/npu_float_status/common.h b/src/csrc/npu_float_status/common.h new file mode 100644 index 0000000000000000000000000000000000000000..e578bfcd50ab3518013644b99ba9bd3b32cd4a17 --- /dev/null +++ b/src/csrc/npu_float_status/common.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include +#include + +#define RUN_SUCCESS 0 +#define RUN_FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) + +const std::string OP_TYPE_NPU_GET_FLOAT_STATUS = "NPUGetFloatStatus"; +const std::string OP_TYPE_NPU_CLEAR_FLOAT_STATUS = "NPUClearFloatStatus"; + +const int FLOAT_STATUS_OP_TENSOR_DIMS_SIZE = 8; +const int FLOAT_STATUS_OVERFLOW = 1; + +#endif // COMMON_H diff --git a/src/csrc/npu_float_status/op_float_status.cpp b/src/csrc/npu_float_status/op_float_status.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e54ff979b9a3f4f4f422afc97c869ca5a0fd937 --- /dev/null +++ b/src/csrc/npu_float_status/op_float_status.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "op_runner.h" +#include "common.h" + +OperatorDesc CreateFloatStatusOpDesc(const std::string opType) +{ + std::vector shape{FLOAT_STATUS_OP_TENSOR_DIMS_SIZE}; + aclDataType dataType = ACL_FLOAT; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc(opType); + if ((opType == OP_TYPE_NPU_GET_FLOAT_STATUS) || + (opType == OP_TYPE_NPU_CLEAR_FLOAT_STATUS)) { + opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); + } + opDesc.AddOutputTensorDesc(dataType, shape.size(), shape.data(), format); + return opDesc; +} + +bool RunGetFloatStatusOp() +{ + OperatorDesc opDesc = CreateFloatStatusOpDesc(OP_TYPE_NPU_GET_FLOAT_STATUS); + + OpRunner opRunner(&opDesc); + if (opRunner.Init() != RUN_SUCCESS) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + if (opRunner.RunOp() != RUN_SUCCESS) { + return false; + } + + const float *result = nullptr; + bool overflowFlag = false; + for (size_t i = 0; i < opRunner.NumInputs(); ++i) { + result = opRunner.GetInputBuffer(i); + if (result == nullptr) { + ERROR_LOG("opRunner.GetInputBuffer failed"); + return false; + } + if (FLOAT_STATUS_OVERFLOW == result[0]) { + overflowFlag = true; + INFO_LOG("Float status is overflow!"); + } + } + + return overflowFlag; +} + +int RunClearFloatStatusOp() +{ + OperatorDesc opDesc = CreateFloatStatusOpDesc(OP_TYPE_NPU_CLEAR_FLOAT_STATUS); + + OpRunner opRunner(&opDesc); + if (opRunner.Init() != RUN_SUCCESS) { + ERROR_LOG("Init OpRunner failed"); + return RUN_FAILED; + } + + if (opRunner.RunOp() != RUN_SUCCESS) { + return RUN_FAILED; + } + + return RUN_SUCCESS; +} + +/* This function is used for linking torch/acl .so files */ +at::Tensor TestFlatten(std::vector tensors) +{ + return torch::utils::flatten_dense_tensors(tensors); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "float status op"; + m.def("RunGetFloatStatusOp", &RunGetFloatStatusOp, "Run get float status op"); + m.def("RunClearFloatStatusOp", &RunClearFloatStatusOp, "Run clear float status op"); + m.def("TestFlatten", &TestFlatten, "Test flatten"); +} \ No newline at end of file diff --git a/src/csrc/npu_float_status/op_runner.cpp b/src/csrc/npu_float_status/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0b2eb18d394b15419f4f9fb6d5ad2e155dcf9e7e --- /dev/null +++ b/src/csrc/npu_float_status/op_runner.cpp @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "op_runner.h" +#include +#include "common.h" +#include +#include + +using namespace std; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (auto *inputBuf : inputBuffers_) { + aclDestroyDataBuffer(inputBuf); + } + + for (auto *devInput : devInputs_) { + aclrtFree(devInput); + } + + for (auto *hostInput : hostInputs_) { + aclrtFreeHost(hostInput); + } + + for (auto *outputBuf : outputBuffers_) { + aclDestroyDataBuffer(outputBuf); + } + + for (auto *devOutput : devOutputs_) { + aclrtFree(devOutput); + } + + for (auto *hostOutput : hostOutputs_) { + aclrtFreeHost(hostOutput); + } +} + +int OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_ERROR_NONE) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return RUN_FAILED; + } + + if (aclrtMemset(devMem, size, 0, size) != ACL_ERROR_NONE) { + ERROR_LOG("Set device memory for input[%zu] failed", i); + return RUN_FAILED; + } + + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostMem = nullptr; + if (aclrtMallocHost(&hostMem, size) != ACL_ERROR_NONE) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return RUN_FAILED; + } + if (hostMem == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return RUN_FAILED; + } + hostInputs_.emplace_back(hostMem); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_ERROR_NONE) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return RUN_FAILED; + } + + if (aclrtMemset(devMem, size, 0, size) != ACL_ERROR_NONE) { + ERROR_LOG("Set device memory for output[%zu] failed", i); + return RUN_FAILED; + } + + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (aclrtMallocHost(&hostOutput, size) != ACL_ERROR_NONE) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return RUN_FAILED; + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return RUN_FAILED; + } + hostOutputs_.emplace_back(hostOutput); + } + + return RUN_SUCCESS; +} + +size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +size_t OpRunner::GetInputSize(size_t index) +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputSize(size_t index) +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +int OpRunner::RunOp() +{ + auto stream = at::npu::getCurrentNPUStream(); + int holdGIL = PyGILState_Check(); + aclError ret = ACL_ERROR_NONE; + + if (holdGIL) { + Py_BEGIN_ALLOW_THREADS + ret = aclopCompileAndExecute(opDesc_->opType.c_str(), + numInputs_, + opDesc_->inputDesc.data(), + inputBuffers_.data(), + numOutputs_, + opDesc_->outputDesc.data(), + outputBuffers_.data(), + opDesc_->opAttr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + nullptr, + stream); + Py_END_ALLOW_THREADS + } else { + ret = aclopCompileAndExecute(opDesc_->opType.c_str(), + numInputs_, + opDesc_->inputDesc.data(), + inputBuffers_.data(), + numOutputs_, + opDesc_->outputDesc.data(), + outputBuffers_.data(), + opDesc_->opAttr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + nullptr, + stream); + } + + if (ret != ACL_ERROR_NONE) { + ERROR_LOG("Execute %s failed. ret = %d", opDesc_->opType.c_str(), ret); + return RUN_FAILED; + } + + if (aclrtSynchronizeStream(stream) != ACL_ERROR_NONE) { + ERROR_LOG("Synchronize stream failed"); + return RUN_FAILED; + } + + if (opDesc_->opType == OP_TYPE_NPU_GET_FLOAT_STATUS) { + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + if (aclrtMemcpy(hostInputs_[i], size, devInputs_[i], size, ACL_MEMCPY_DEVICE_TO_HOST) != ACL_ERROR_NONE) { + ERROR_LOG("Copy input[%zu] failed", i); + return RUN_FAILED; + } + } + } + + return RUN_SUCCESS; +} diff --git a/src/csrc/npu_float_status/op_runner.h b/src/csrc/npu_float_status/op_runner.h new file mode 100644 index 0000000000000000000000000000000000000000..81b38b73573108a92bd53625682f7ce79377a5ad --- /dev/null +++ b/src/csrc/npu_float_status/op_runner.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + ~OpRunner(); + + /** + * @brief Init op runner + */ + int Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + size_t GetInputSize(size_t index); + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index); + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Run op + * @return run result + */ + int RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/src/csrc/npu_float_status/operator_desc.cpp b/src/csrc/npu_float_status/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f55e6efdbe79383d82ccfa7696b9a82c0bff1979 --- /dev/null +++ b/src/csrc/npu_float_status/operator_desc.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common.h" +#include "operator_desc.h" + +using namespace std; + +OperatorDesc::OperatorDesc(std::string opType) : opType(std::move(opType)) +{ + opAttr = aclopCreateAttr(); +} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } + + aclopDestroyAttr(opAttr); +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + if (numDims > 0 && dims == nullptr) { + ERROR_LOG("dims is nullptr while numDims > 0"); + return *this; + } + inputDesc.push_back(aclCreateTensorDesc(dataType, numDims, dims, format)); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + if (numDims > 0 && dims == nullptr) { + ERROR_LOG("dims is nullptr while numDims > 0"); + return *this; + } + + outputDesc.push_back(aclCreateTensorDesc(dataType, numDims, dims, format)); + return *this; +} diff --git a/src/csrc/npu_float_status/operator_desc.h b/src/csrc/npu_float_status/operator_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..9035ff71d376cdfae693f023e7559bd0a287ba00 --- /dev/null +++ b/src/csrc/npu_float_status/operator_desc.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020, Huawei Technologies.All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + * @param [in] opType: op type + */ + explicit OperatorDesc(std::string opType); + + /** + * Destructor + */ + ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; + aclopAttr *opAttr; +}; + +#endif // OPERATOR_DESC_H diff --git a/tests/L0/device.py b/tests/L0/device.py new file mode 100644 index 0000000000000000000000000000000000000000..5f103cc0413d195559f4f34e4d45352e73a82dab --- /dev/null +++ b/tests/L0/device.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +if torch.cuda.is_available(): + CALCULATE_DEVICE = 'cuda' +else: + CALCULATE_DEVICE = 'npu' + +def is_npu(): + return CALCULATE_DEVICE.find('npu') != -1 \ No newline at end of file diff --git a/tests/L0/run_optimizers/test_lamb_mnist.py b/tests/L0/run_optimizers/test_lamb_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..be37ead317d8900e1f99c8706ad66a6022ef5b23 --- /dev/null +++ b/tests/L0/run_optimizers/test_lamb_mnist.py @@ -0,0 +1,176 @@ +# This is based on pytorch-lamb (https://github.com/cybertronai/pytorch-lamb). +# +# Copyright (c) 2021, Huawei Technologies. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MNIST example. + +Based on https://github.com/pytorch/examples/blob/master/mnist/main.py +""" + +from __future__ import print_function +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +import tqdm +import apex +import collections +from tensorboardX import SummaryWriter +from torchvision import datasets, transforms +from apex import amp +from torch.optim import Optimizer + +CALCULATE_DEVICE = "npu:0" +SOURCE_DIR = "/home/data/" +EPS = 0.97 +def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): + """Log a histogram of trust ratio scalars in across layers.""" + results = collections.defaultdict(list) + for group in optimizer.param_groups: + for p in group['params']: + state = optimizer.state[p] + for i in ('weight_norm', 'adam_norm', 'trust_ratio'): + if i in state: + results[i].append(state[i]) + + for k, v in results.items(): + event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4*4*50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4*4*50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + +def train(args, model, device, train_loader, optimizer, epoch, event_writer): + model.train() + tqdm_bar = tqdm.tqdm(train_loader) + for batch_idx, (data, target) in enumerate(tqdm_bar): + data = data.to(device) + optimizer.zero_grad() + output = model(data) + output = output.to("cpu").to(torch.float) + loss = F.nll_loss(output, target) + loss = loss.to(device).to(torch.float16) + + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + optimizer.step() + if batch_idx 
% args.log_interval == 0: + step = batch_idx * len(data) + (epoch-1) * len(train_loader.dataset) + log_lamb_rs(optimizer, event_writer, step) + event_writer.add_scalar('loss', loss.item(), step) + tqdm_bar.set_description( + f'Train epoch {epoch} Loss: {loss.item():.6f}') + +def test(args, model, device, test_loader, event_writer:SummaryWriter, epoch): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data = data.to(device) + output = model(data) + output = output.to("cpu") + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + acc = correct / len(test_loader.dataset) + event_writer.add_scalar('loss/test_loss', test_loss, epoch - 1) + event_writer.add_scalar('loss/test_acc', acc, epoch - 1) + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. * acc)) + if acc < EPS: + raise Exception("Accuracy dose not meet expect!") + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--optimizer', type=str, default='lamb', choices=['lamb', 'adam'], + help='which optimizer to use') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=6, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.0025, metavar='LR', + help='learning rate (default: 0.0025)') + parser.add_argument('--wd', type=float, default=0.01, metavar='WD', + help='weight decay (default: 0.01)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + parser.add_argument('--npu', type=int, default=None, + help='NPU id to use') + parser.add_argument('--data', type=str, default=SOURCE_DIR, help='path of dataset') + + args = parser.parse_args() + use_cuda = torch.cuda.is_available() + + torch.manual_seed(args.seed) + + global CALCULATE_DEVICE + if args.npu is not None: + CALCULATE_DEVICE = "npu:{}".format(args.npu) + torch.npu.set_device(CALCULATE_DEVICE) + device = CALCULATE_DEVICE + print("use ", CALCULATE_DEVICE) + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + datasets.MNIST(args.data, train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST(args.data, train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.test_batch_size, shuffle=True, **kwargs) + + + model = Net().to(device) + optimizer = apex.optimizers.Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd, betas=(.9, .999), + adam=(args.optimizer == 'adam')) + model, optimizer = amp.initialize(model, optimizer, 
opt_level="O2", loss_scale=1024, verbosity=1) + writer = SummaryWriter() + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch, writer) + test(args, model, device, test_loader, writer, epoch) + + +if __name__ == '__main__': + main() diff --git a/tests/L1/common/compare_npu.py b/tests/L1/common/compare_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..82af5534b06b59563e1448f5fe1a5bbafdd409f2 --- /dev/null +++ b/tests/L1/common/compare_npu.py @@ -0,0 +1,77 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://spdx.org/licenses/BSD-3-Clause.html +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import torch +import itertools + +EPS = 0.05 +err_count = 0 + +def compare_loss(loss_cmp, loss_base): + temp = (loss_cmp - loss_base) / loss_base + if (loss_cmp < loss_base) or temp < EPS: + return True, temp + else: + return False, temp + +parser = argparse.ArgumentParser(description='Compare') +parser.add_argument('--opt-level', type=str) +parser.add_argument('--loss-scale', type=str, default=None) +args = parser.parse_args() + +base_file = "_" + str(args.opt_level) + "_" +\ + str(args.loss_scale) + "_" + +file_names = [] +file_descs = [] +dict_datas = [] + +for comb in itertools.product(['True','False'], ['True','False']): + file_name = comb[0] + base_file + comb[1] + file_names.append(file_name) + dict_datas.append(torch.load(file_name)) + file_desc = "combine_grad=" + comb[0] + " opt_level=" + args.opt_level +\ + " loss-scale=" + args.loss_scale + " npu_fused_sgd=" + comb[1] + file_descs.append(file_desc) + +torch.set_printoptions(precision=10) + +print() +opt_str = "opt_level=" + args.opt_level + " loss-scale=" + args.loss_scale +print("Compare with baseline: combine_grad=False " + opt_str + " npu_fused_sgd=False EPS", EPS) +print() + +for n, (i_e, i_p) in enumerate(zip(dict_datas[0]["Iteration"], dict_datas[1]["Iteration"])): + print("step = {}:".format(i_e)) + assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p) + + loss_base = dict_datas[3]["Loss"][n] + for file_name, dict_data, file_desc in zip(file_names, dict_datas, file_descs): + if file_name == file_names[3]: + break + else: + loss = dict_data["Loss"][n] + result = file_desc + " loss :{:.3f}".format(loss) + " loss base:{:.3f}".format(loss_base) + res, ratio = compare_loss(loss, loss_base) + if res == False: + err_count = err_count + 1 + result = result + " ratio:{:.3f}, loss compare Failed".format(ratio) + else: + result = result + " ratio:{:.3f}, loss compare Ok".format(ratio) + print(result) +if (err_count > 0): + raise Exception("Loss compare failed!") + diff --git a/tests/L1/common/main_amp_npu.py b/tests/L1/common/main_amp_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba1075bb204ca1edb342ccbc3b776d4773943a2 --- /dev/null +++ b/tests/L1/common/main_amp_npu.py @@ -0,0 +1,702 @@ +# -*- coding: utf-8 -*- +''' +BSD 3-Clause License + +Copyright (c) Soumith Chintala 2016, +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://spdx.org/licenses/BSD-3-Clause.html +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+''' + +import argparse +import os +import random +import shutil +import time +import warnings +import math +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +from apex import amp, optimizers +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +import torch.npu + +BATCH_SIZE = 512 +EPOCHS_SIZE = 100 +TRAIN_STEP = 8000 +LOG_STEP = 1 + +CALCULATE_DEVICE = "npu:0" +PRINT_DEVICE = "cpu" +SOURCE_DIR = "/data/imagenet" + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', + metavar='DIR', + default=SOURCE_DIR, + help='path to dataset') +parser.add_argument('-a', '--arch', + metavar='ARCH', + default='resnet50', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', + default=32, + type=int, + metavar='N', + help='number of data loading workers (default: 8)') +parser.add_argument('--epochs', + default=EPOCHS_SIZE, + type=int, + metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', + default=0, + type=int, + metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', + default=BATCH_SIZE, + type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', + default=0.1, + type=float, + metavar='LR', + help='initial learning rate', + dest='lr') +parser.add_argument('--momentum', + default=0.9, + type=float, + metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', + default=1e-4, + type=float, + metavar='W', + help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', + default=10, + type=int, + metavar='N', + help='print frequency (default: 10)') +parser.add_argument('--resume', + default='', + type=str, + metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', + dest='evaluate', + action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', + dest='pretrained', + action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', + default=1, + type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', + default=-1, + type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', + default='tcp://0.0.0.0:23456', + type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', + default='nccl', + type=str, + help='distributed backend') +parser.add_argument('--seed', + default=None, + type=int, + help='seed for initializing training. 
') +parser.add_argument('--gpu', + default=None, + type=int, + help='GPU id to use.') +parser.add_argument('--npu', + default=None, + type=int, + help='NPU id to use.') +parser.add_argument('--multiprocessing-distributed', + action='store_true') +parser.add_argument('--warmup', + default=0, + type=int, + metavar='E', + help='number of warmup epochs') +parser.add_argument('--label-smoothing', + default=0.0, + type=float, + metavar='S', + help='label smoothing') +parser.add_argument('--optimizer-batch-size', + default=-1, + type=int, + metavar='N', + help= + 'size of a total batch size, for simulating bigger batches using gradient accumulation') +parser.add_argument('--static-loss-scale', + type=float, + default=1, + help= + 'Static loss scale, positive power of 2 values can improve fp16 convergence.') + +parser.add_argument('--deterministic', action='store_true') +parser.add_argument('--opt-level', type=str) +parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) +parser.add_argument('--loss-scale', type=str, default=None) +parser.add_argument('--npu-fused-sgd', action='store_true') +parser.add_argument('--combine-grad', action='store_true') +parser.add_argument('--local-rank', default=0, type=int) +parser.add_argument('--prints-to-process', type=int, default=10) + +best_acc1 = 0 + +def main(): + args = parser.parse_args() + if args.deterministic: + random.seed(args.local_rank) + torch.manual_seed(args.local_rank) + torch.set_printoptions(precision=10) + + global CALCULATE_DEVICE + if args.npu is None: + args.npu = 0 + else: + CALCULATE_DEVICE = "npu:{}".format(args.npu) + torch.npu.set_device(CALCULATE_DEVICE) + print("use ", CALCULATE_DEVICE) + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.npu.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch](zero_init_residual=True) + for layer in model.modules(): + if isinstance(layer, nn.Linear): + torch.nn.init.kaiming_normal_(layer.weight, a=math.sqrt(5), ) + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. 
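+        # Note: in the single-NPU flow exercised by run_test_npu.sh, args.distributed is
+        # False and args.gpu is None, so the CUDA-specific branches below are skipped and
+        # the model is simply moved to CALCULATE_DEVICE in the final else branch.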
+ if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = model.to(CALCULATE_DEVICE) + + lr_policy = lr_cosine_policy(args.lr, + args.warmup, + args.epochs) + + + # define loss function (criterion) and optimizer + loss = nn.CrossEntropyLoss + if args.label_smoothing > 0.0: + loss = lambda: LabelSmoothing(args.label_smoothing) + criterion = loss().to(CALCULATE_DEVICE) + if args.npu_fused_sgd: + optimizer = optimizers.NpuFusedSGD([ + {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'], 'weight_decay': 0.0}, + {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'], 'weight_decay': args.weight_decay}], + args.lr, + momentum=args.momentum) + else: + optimizer = torch.optim.SGD([ + {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'], 'weight_decay': 0.0}, + {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'], 'weight_decay': args.weight_decay}], + args.lr, + momentum=args.momentum) + + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=args.combine_grad, verbosity=1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.npu is not None: + checkpoint = torch.load(args.resume) + elif args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.npu is not None: + best_acc1 = best_acc1.to("npu:{}".format(args.npu)) + elif args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + #optimizer.load_state_dict(checkpoint['optimizer']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=True) + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + # adjust_learning_rate(optimizer, epoch, args) + lr_policy(optimizer, 0, epoch) + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + file_name = "checkpoint_npu{}".format(args.npu) + modeltmp = model.cpu() + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': modeltmp.state_dict(), + # 'state_dict': model, + 'best_acc1': best_acc1.to("cpu"), + # 'optimizer' : optimizer.state_dict(), + }, is_best.to("cpu"), file_name) + modeltmp.to(CALCULATE_DEVICE) + +def train(train_loader, model, criterion, optimizer, epoch, args): + if args.optimizer_batch_size < 0: + batch_size_multiplier = 1 + else: + tbs = 1 * args.batch_size + if args.optimizer_batch_size % tbs != 0: + print( + "Warning: simulated batch size {} is not divisible by actual batch size {}" + .format(args.optimizer_batch_size, tbs)) + batch_size_multiplier = int(args.optimizer_batch_size / tbs) + print("BSM: {}".format(batch_size_multiplier)) + + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + optimizer.zero_grad() + end = time.time() + + run_info_dict = {"Iteration" : [], + "Loss" : [], + "Speed" : []} + + 
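+    # Gradient accumulation: the loss is scaled via amp.scale_loss() before backward(),
+    # gradients accumulate in the parameters' .grad buffers across iterations, and every
+    # batch_size_multiplier-th iteration they are averaged (divided by the multiplier)
+    # and applied with a single optimizer.step() followed by optimizer.zero_grad().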
for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + + images = images.to(CALCULATE_DEVICE, non_blocking=True) + if args.label_smoothing == 0.0: + target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + if args.label_smoothing > 0.0: + target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + optimizer_step = ((i + 1) % batch_size_multiplier) == 0 + if optimizer_step: + if batch_size_multiplier != 1: + for param_group in optimizer.param_groups: + for param in param_group['params']: + param.grad /= batch_size_multiplier + optimizer.step() + optimizer.zero_grad() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + if i % args.print_freq == 0 and i > 1: + if args.local_rank == 0: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Speed {3:.3f} ({4:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.10f} ({loss.avg:.4f})\t' + 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(train_loader), + args.world_size * args.batch_size / batch_time.val, + args.world_size * args.batch_size / batch_time.avg, + batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1, top5=top5)) + run_info_dict["Iteration"].append(i) + run_info_dict["Loss"].append(losses.val) + run_info_dict["Speed"].append(args.world_size * args.batch_size / batch_time.val) + if len(run_info_dict["Loss"]) == args.prints_to_process: + if args.local_rank == 0: + torch.save(run_info_dict, + str(args.combine_grad) + "_" + str(args.opt_level) + "_" + + str(args.loss_scale) + "_" + str(args.npu_fused_sgd)) + quit() + + if i % LOG_STEP == 0: + progress.display(i) + + if i == TRAIN_STEP: + break + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + images = images.to(CALCULATE_DEVICE, non_blocking=True) + if args.label_smoothing == 0.0: + target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + if args.label_smoothing > 0.0: + target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % 
LOG_STEP == 0: + progress.display(i) + + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + return top1.avg + +def save_checkpoint(state, is_best, filename='checkpoint'): + filename2 = filename + ".pth.tar" + torch.save(state, filename2) + if is_best: + shutil.copyfile(filename2, filename+'model_best.pth.tar') + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + self.start_count_index = 10 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + if self.count == 0: + self.batchsize = n + self.val = val + self.count += n + if self.count > (self.start_count_index * self.batchsize): + self.sum += val * n + self.avg = self.sum / (self.count - self.start_count_index * self.batchsize) + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def adjust_learning_rate(optimizer, epoch, args): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + lr = args.lr * (0.1 ** (epoch // 30)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +class LabelSmoothing(nn.Module): + """ + NLL loss with label smoothing. + """ + def __init__(self, smoothing=0.0): + """ + Constructor for the LabelSmoothing module. 
+ + :param smoothing: label smoothing factor + """ + super(LabelSmoothing, self).__init__() + self.confidence = 1.0 - smoothing + self.smoothing = smoothing + + def forward(self, x, target): + logprobs = torch.nn.functional.log_softmax(x, dim=-1).to("cpu") + nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean().to(CALCULATE_DEVICE) + +def lr_policy(lr_fn, logger=None): + def _alr(optimizer, iteration, epoch): + lr = lr_fn(iteration, epoch) + + if logger is not None: + logger.log_metric('lr', lr) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + return _alr + +def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None): + def _lr_fn(iteration, epoch): + if epoch < warmup_length: + lr = base_lr * (epoch + 1) / warmup_length + else: + e = epoch - warmup_length + es = epochs - warmup_length + lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr + return lr + + return lr_policy(_lr_fn, logger=logger) + +if __name__ == '__main__': + main() diff --git a/tests/L1/common/run_test_npu.sh b/tests/L1/common/run_test_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..841e8306d3e1ed7fb8511b9045af12784e102982 --- /dev/null +++ b/tests/L1/common/run_test_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +print_banner() { + printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n" +} + +print_banner "Distributed status: $1" + +echo $2 +DATADIR=$2 + +if [ "$1" == "single_npu" ] +then + BASE_CMD="python main_amp_npu.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5 " +fi +if [ $# == 3 ] +then + BASE_CMD="${BASE_CMD} --npu $3" +fi + +combine_grads=( +"" +"--combine-grad" +) + +npu_fused_sgds=( +"" +"--npu-fused-sgd" +) + +loss_scales=( +"--loss-scale 1024.0" +) + +opt_levels=( +"O1" +"O2" +) + +rm True* +rm False* + +set -e + +for opt_level in "${opt_levels[@]}" +do + for loss_scale in "${loss_scales[@]}" + do + for combine_grad in "${combine_grads[@]}" + do + for npu_fused_sgd in "${npu_fused_sgds[@]}" + do + print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${combine_grad} ${npu_fused_sgd} $DATADIR" + set -x + ${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${combine_grad} ${npu_fused_sgd} $DATADIR + set +x + done + done + done +done + +for opt_level in "${opt_levels[@]}" +do + for loss_scale in "${loss_scales[@]}" + do + echo "compare_npu.py --opt-level ${opt_level} ${loss_scale}" + set -x + python compare_npu.py --opt-level ${opt_level} ${loss_scale} + set +x + done +done