From 6dc30ee5b4a4b68a3bd79e92d5e7e62bf74481be Mon Sep 17 00:00:00 2001 From: xiaxia3 Date: Thu, 11 Aug 2022 13:11:14 +0000 Subject: [PATCH] modify container_abcs to support pytorch v1.11.0 --- patch/npu.patch | 563 +++++++++++++++++++++++++----------------------- 1 file changed, 290 insertions(+), 273 deletions(-) diff --git a/patch/npu.patch b/patch/npu.patch index b20b3d4..653d07f 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -1,225 +1,41 @@ -diff -Nur '--exclude=.git' apex/apex/amp/amp.py apex-develop/apex/amp/amp.py ---- apex/apex/amp/amp.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/amp.py 2022-05-13 10:49:00.196000000 +0800 -@@ -65,7 +65,14 @@ - - - # Top-level function to insert _all_ the hooks. --def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False): -+def init( -+ enabled=True, -+ loss_scale="dynamic", -+ enable_caching=True, -+ verbose=False, -+ allow_banned=False, -+ user_cast_preferred=None): -+ - global _DECORATOR_HANDLE - - if not enabled: -@@ -76,7 +83,10 @@ - handle = AmpHandle(loss_scale, enable_caching, verbose) - - # 0) Force-{fp16, fp32} for user-annotated functions -+ _user_cast_registry = set() - for mod, fn, cast_fn in _USER_CAST_REGISTRY: -+ if user_cast_preferred: -+ _user_cast_registry.add((mod, fn)) - try_caching = (cast_fn == utils.maybe_half) - wrap.cached_cast(mod, fn, cast_fn, handle, - try_caching, verbose) -@@ -96,6 +106,8 @@ - for module, (list_name, cast_fn) in itertools.product(override_modules, - cast_table): - for fn in getattr(module, list_name): -+ if user_cast_preferred and (module.MODULE, fn) in _user_cast_registry: -+ continue - try_caching = (cast_fn == utils.maybe_half) - wrap.cached_cast(module.MODULE, fn, cast_fn, handle, - try_caching, verbose) -diff -Nur '--exclude=.git' apex/apex/amp/frontend.py apex-develop/apex/amp/frontend.py ---- apex/apex/amp/frontend.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/frontend.py 2022-05-13 10:49:00.196000000 +0800 -@@ -19,6 +19,11 @@ - "keep_batchnorm_fp32" : None, - "master_weights" : None, - "loss_scale" : 1.0, -+ "combine_grad": None, -+ "combine_ddp": None, -+ "ddp_replica_count": 4, -+ "check_combined_tensors": None, -+ "user_cast_preferred":None, - # Reserved for future functionality - # "fused_optimizer" : False, - # "enable_ddp_interop" : False, -@@ -91,6 +96,20 @@ - self.options[name] = value - else: - self.options[name] = float(value) -+ elif name == "combine_grad" or name == "check_combined_tensors": -+ if self.opt_level not in ["O1", "O2"] and value: -+ warn_or_err("Currently, combine_grad=True or check_combined_tensors=True should only be set " -+ "by selecting opt_level='O1' or opt_level='O2'.") -+ self.options[name] = value -+ elif name == "combine_ddp": -+ if not self.combine_grad: -+ warn_or_err("Combine_grad should be True when combine_ddp using.. 
\n") -+ self.options[name] = value -+ elif name == "user_cast_preferred": -+ if self.opt_level != "O1" and value: -+ warn_or_err("Currently, user_cast_preferred=True should only be set by " -+ "selecting opt_level='O1'.") -+ self.options[name] = value - else: - self.options[name] = value - else: -@@ -161,6 +180,7 @@ - properties.keep_batchnorm_fp32 = None - properties.master_weights = None - properties.loss_scale = "dynamic" -+ properties.combine_grad = None - # properties.fused_optimizer = False - # properties.enable_ddp_interop = False - return properties # modified in place so this isn't really necessary -@@ -205,8 +225,17 @@ - cast_model_outputs=None, - num_losses=1, - verbosity=1, -+ dynamic_init_scale=2.**16, -+ scale_growth_factor=2., -+ scale_backoff_factor=0.5, -+ scale_window=2000, - min_loss_scale=None, -- max_loss_scale=2.**24 -+ max_loss_scale=2.**24, -+ combine_grad=None, -+ combine_ddp=None, -+ ddp_replica_count=4, -+ user_cast_preferred=None, -+ check_combined_tensors=None - ): - """ - Initialize your models, optimizers, and the Torch tensor and functional namespace according to the -@@ -254,11 +283,32 @@ - support multiple losses/backward passes, but use a single global loss scale - for all of them. - verbosity (int, default=1): Set to 0 to suppress Amp-related output. -+ dynamic_init_scale (float, optional, default=2.**16): Initial dynamic loss scale factor. -+ scale_growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied -+ if no overflow occurs for ``scale_window`` consecutive iterations. -+ If dynamic loss scaling is not used, `scale_growth_factor` is ignored. -+ scale_backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied -+ if overflow occurs in an iteration. If dynamic loss scaling is not used, `scale_backoff_factor` is ignored. -+ scale_window (int, optional, default=2000): Number of consecutive iterations without overflow -+ that must occur for the scale to be multiplied by ``scale_growth_factor``. -+ If dynamic loss scaling is not used, `scale_window` is ignored. - min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic - loss scaling. The default value of None means that no floor is imposed. - If dynamic loss scaling is not used, `min_loss_scale` is ignored. - max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by - dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored. -+ combine_grad (bool, optional, default=None): If True, make gradients fused for unscale. -+ combine_ddp (bool, optional, default=None): If True, use combined gradients for data exchange, -+ accelerate multi-card training, and functionally replace DistributedDataParallel. -+ ddp_replica_count (bool, optional, default=4): Set the number of replicas of combined gradients. -+ Theoretically, the more replicas, the higher the degree of parallelism, but the time-consuming -+ distribution operation itself will lead to a decrease in performance even though the degree -+ of parallelism is improved. Therefore, we limit and optimize the replica size for data exchange. -+ The final number of replicas is not necessarily exactly the same as the set number -+ user_cast_preferred (bool, optional, default=None): If True in O1, user cast registry is preferred -+ rather than fp16 white- / black-list, to avoid redundant dtype cast. 
-+ check_combined_tensors (bool, optional, default=None): If True, check if the combined grads and combined params -+ are valid during training - - Returns: - Model(s) and optimizer(s) modified according to the ``opt_level``. -@@ -306,6 +356,7 @@ - https://github.com/NVIDIA/apex/issues - """ - _amp_state.opt_properties = Properties() -+ # Here add a switch to open combine tensor - _amp_state.verbosity = verbosity - - if not enabled: -@@ -330,6 +381,10 @@ - for k, v in _amp_state.opt_properties.options.items(): - maybe_print("{:22} : {}".format(k, v), True) - -+ _amp_state.dynamic_init_scale = dynamic_init_scale -+ _amp_state.scale_growth_factor = scale_growth_factor -+ _amp_state.scale_backoff_factor = scale_backoff_factor -+ _amp_state.scale_window = scale_window - _amp_state.min_loss_scale = min_loss_scale - _amp_state.max_loss_scale = max_loss_scale - -@@ -350,6 +405,16 @@ - _amp_state.opt_properties.master_weights = master_weights - if loss_scale is not None: - _amp_state.opt_properties.loss_scale = loss_scale -+ if combine_grad is not None: -+ _amp_state.opt_properties.combine_grad = combine_grad -+ if combine_ddp is not None: -+ _amp_state.opt_properties.combine_ddp = combine_ddp -+ if ddp_replica_count is not None: -+ _amp_state.opt_properties.ddp_replica_count = ddp_replica_count -+ if user_cast_preferred is not None: -+ _amp_state.opt_properties.user_cast_preferred = user_cast_preferred -+ if check_combined_tensors is not None: -+ _amp_state.opt_properties.check_combined_tensors = check_combined_tensors - - maybe_print("After processing overrides, optimization options are:", True) - for k, v in _amp_state.opt_properties.options.items(): -diff -Nur '--exclude=.git' apex/apex/amp/handle.py apex-develop/apex/amp/handle.py ---- apex/apex/amp/handle.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/handle.py 2022-05-13 10:49:00.196000000 +0800 -@@ -1,3 +1,19 @@ -+# Copyright (c) 2020, Huawei Technologies. -+# Copyright (c) 2019, NVIDIA CORPORATION. -+# All rights reserved. -+# -+# Licensed under the BSD 3-Clause License (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# https://opensource.org/licenses/BSD-3-Clause -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. 
-+ - import contextlib - import warnings - import sys -@@ -110,6 +126,9 @@ - if not optimizer._amp_stash.params_have_scaled_gradients: - optimizer._prepare_amp_backward() +diff -Nur '--exclude=.git' apex/.gitignore apex-develop/.gitignore +--- apex/.gitignore 2022-08-11 12:25:49.507879872 +0000 ++++ apex-develop/.gitignore 1970-01-01 00:00:00.000000000 +0000 +@@ -1,5 +0,0 @@ +-apex.egg-info +-dist +-build +-docs/build +-*~ +\ No newline at end of file +diff -Nur '--exclude=.git' apex/.gitmodules apex-develop/.gitmodules +--- apex/.gitmodules 2022-08-11 12:25:49.507879872 +0000 ++++ apex-develop/.gitmodules 1970-01-01 00:00:00.000000000 +0000 +@@ -1,4 +0,0 @@ +-[submodule "apex/contrib/csrc/multihead_attn/cutlass"] +- path = apex/contrib/csrc/multihead_attn/cutlass +- url = https://github.com/NVIDIA/cutlass.git +- branch = v1.2.0 +diff -Nur '--exclude=.git' apex/apex/amp/_amp_state.py apex-develop/apex/amp/_amp_state.py +--- apex/apex/amp/_amp_state.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/_amp_state.py 2022-08-11 12:30:58.659866271 +0000 +@@ -8,10 +8,10 @@ + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + +-if TORCH_MAJOR == 0: +- import collections.abc as container_abcs +-else: ++if TORCH_MAJOR == 1 and TORCH_MINOR < 8: + from torch._six import container_abcs ++else: ++ import collections.abc as container_abcs -+ if loss_scaler.dynamic: -+ LossScaler.clear_npu_overflow_flag() -+ - yield (loss.float())*loss_scale - if delay_unscale: -@@ -142,8 +161,12 @@ - # Maybe skip should delegate to a method owned by the optimizers themselves. - if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"): - # Clear the master grads that wouldn't be zeroed by model.zero_grad() -- for param in opt._amp_stash.all_fp32_from_fp16_params: -- param.grad = None -+ if opt.accelerate or opt.is_npu_fused_optimizer: -+ if opt._amp_stash.main_fp32_from_fp16_grad_combine is not None: -+ opt._amp_stash.main_fp32_from_fp16_grad_combine.zero_() -+ else: -+ for param in opt._amp_stash.all_fp32_from_fp16_params: -+ param.grad = None - if hasattr(opt, "most_recent_scale"): - opt.most_recent_scale = 1.0 - opt.scale_set_by_backward = False + class AmpState(object): diff -Nur '--exclude=.git' apex/apex/amp/_initialize.py apex-develop/apex/amp/_initialize.py ---- apex/apex/amp/_initialize.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/_initialize.py 2022-05-13 10:49:00.196000000 +0800 +--- apex/apex/amp/_initialize.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/_initialize.py 2022-08-11 12:30:58.659866271 +0000 @@ -1,9 +1,26 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -337,8 +153,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/_initialize.py apex-develop/apex/amp/_i if models_was_list: return models, optimizers diff -Nur '--exclude=.git' apex/apex/amp/_process_optimizer.py apex-develop/apex/amp/_process_optimizer.py ---- apex/apex/amp/_process_optimizer.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/_process_optimizer.py 2022-05-13 10:49:00.196000000 +0800 +--- apex/apex/amp/_process_optimizer.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/_process_optimizer.py 2022-08-11 12:30:58.659866271 +0000 @@ -1,9 +1,77 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. 
@@ -1795,9 +1611,228 @@ diff -Nur '--exclude=.git' apex/apex/amp/_process_optimizer.py apex-develop/apex stash.all_fp32_params.append(param) stash.all_fp32_grad_stash.append(None) else: +diff -Nur '--exclude=.git' apex/apex/amp/amp.py apex-develop/apex/amp/amp.py +--- apex/apex/amp/amp.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/amp.py 2022-08-11 12:30:58.659866271 +0000 +@@ -65,7 +65,14 @@ + + + # Top-level function to insert _all_ the hooks. +-def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False): ++def init( ++ enabled=True, ++ loss_scale="dynamic", ++ enable_caching=True, ++ verbose=False, ++ allow_banned=False, ++ user_cast_preferred=None): ++ + global _DECORATOR_HANDLE + + if not enabled: +@@ -76,7 +83,10 @@ + handle = AmpHandle(loss_scale, enable_caching, verbose) + + # 0) Force-{fp16, fp32} for user-annotated functions ++ _user_cast_registry = set() + for mod, fn, cast_fn in _USER_CAST_REGISTRY: ++ if user_cast_preferred: ++ _user_cast_registry.add((mod, fn)) + try_caching = (cast_fn == utils.maybe_half) + wrap.cached_cast(mod, fn, cast_fn, handle, + try_caching, verbose) +@@ -96,6 +106,8 @@ + for module, (list_name, cast_fn) in itertools.product(override_modules, + cast_table): + for fn in getattr(module, list_name): ++ if user_cast_preferred and (module.MODULE, fn) in _user_cast_registry: ++ continue + try_caching = (cast_fn == utils.maybe_half) + wrap.cached_cast(module.MODULE, fn, cast_fn, handle, + try_caching, verbose) +diff -Nur '--exclude=.git' apex/apex/amp/frontend.py apex-develop/apex/amp/frontend.py +--- apex/apex/amp/frontend.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/frontend.py 2022-08-11 12:30:58.659866271 +0000 +@@ -19,6 +19,11 @@ + "keep_batchnorm_fp32" : None, + "master_weights" : None, + "loss_scale" : 1.0, ++ "combine_grad": None, ++ "combine_ddp": None, ++ "ddp_replica_count": 4, ++ "check_combined_tensors": None, ++ "user_cast_preferred":None, + # Reserved for future functionality + # "fused_optimizer" : False, + # "enable_ddp_interop" : False, +@@ -91,6 +96,20 @@ + self.options[name] = value + else: + self.options[name] = float(value) ++ elif name == "combine_grad" or name == "check_combined_tensors": ++ if self.opt_level not in ["O1", "O2"] and value: ++ warn_or_err("Currently, combine_grad=True or check_combined_tensors=True should only be set " ++ "by selecting opt_level='O1' or opt_level='O2'.") ++ self.options[name] = value ++ elif name == "combine_ddp": ++ if not self.combine_grad: ++ warn_or_err("Combine_grad should be True when combine_ddp using.. 
\n") ++ self.options[name] = value ++ elif name == "user_cast_preferred": ++ if self.opt_level != "O1" and value: ++ warn_or_err("Currently, user_cast_preferred=True should only be set by " ++ "selecting opt_level='O1'.") ++ self.options[name] = value + else: + self.options[name] = value + else: +@@ -161,6 +180,7 @@ + properties.keep_batchnorm_fp32 = None + properties.master_weights = None + properties.loss_scale = "dynamic" ++ properties.combine_grad = None + # properties.fused_optimizer = False + # properties.enable_ddp_interop = False + return properties # modified in place so this isn't really necessary +@@ -205,8 +225,17 @@ + cast_model_outputs=None, + num_losses=1, + verbosity=1, ++ dynamic_init_scale=2.**16, ++ scale_growth_factor=2., ++ scale_backoff_factor=0.5, ++ scale_window=2000, + min_loss_scale=None, +- max_loss_scale=2.**24 ++ max_loss_scale=2.**24, ++ combine_grad=None, ++ combine_ddp=None, ++ ddp_replica_count=4, ++ user_cast_preferred=None, ++ check_combined_tensors=None + ): + """ + Initialize your models, optimizers, and the Torch tensor and functional namespace according to the +@@ -254,11 +283,32 @@ + support multiple losses/backward passes, but use a single global loss scale + for all of them. + verbosity (int, default=1): Set to 0 to suppress Amp-related output. ++ dynamic_init_scale (float, optional, default=2.**16): Initial dynamic loss scale factor. ++ scale_growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied ++ if no overflow occurs for ``scale_window`` consecutive iterations. ++ If dynamic loss scaling is not used, `scale_growth_factor` is ignored. ++ scale_backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied ++ if overflow occurs in an iteration. If dynamic loss scaling is not used, `scale_backoff_factor` is ignored. ++ scale_window (int, optional, default=2000): Number of consecutive iterations without overflow ++ that must occur for the scale to be multiplied by ``scale_growth_factor``. ++ If dynamic loss scaling is not used, `scale_window` is ignored. + min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic + loss scaling. The default value of None means that no floor is imposed. + If dynamic loss scaling is not used, `min_loss_scale` is ignored. + max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by + dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored. ++ combine_grad (bool, optional, default=None): If True, make gradients fused for unscale. ++ combine_ddp (bool, optional, default=None): If True, use combined gradients for data exchange, ++ accelerate multi-card training, and functionally replace DistributedDataParallel. ++ ddp_replica_count (bool, optional, default=4): Set the number of replicas of combined gradients. ++ Theoretically, the more replicas, the higher the degree of parallelism, but the time-consuming ++ distribution operation itself will lead to a decrease in performance even though the degree ++ of parallelism is improved. Therefore, we limit and optimize the replica size for data exchange. ++ The final number of replicas is not necessarily exactly the same as the set number ++ user_cast_preferred (bool, optional, default=None): If True in O1, user cast registry is preferred ++ rather than fp16 white- / black-list, to avoid redundant dtype cast. 
++ check_combined_tensors (bool, optional, default=None): If True, check if the combined grads and combined params ++ are valid during training + + Returns: + Model(s) and optimizer(s) modified according to the ``opt_level``. +@@ -306,6 +356,7 @@ + https://github.com/NVIDIA/apex/issues + """ + _amp_state.opt_properties = Properties() ++ # Here add a switch to open combine tensor + _amp_state.verbosity = verbosity + + if not enabled: +@@ -330,6 +381,10 @@ + for k, v in _amp_state.opt_properties.options.items(): + maybe_print("{:22} : {}".format(k, v), True) + ++ _amp_state.dynamic_init_scale = dynamic_init_scale ++ _amp_state.scale_growth_factor = scale_growth_factor ++ _amp_state.scale_backoff_factor = scale_backoff_factor ++ _amp_state.scale_window = scale_window + _amp_state.min_loss_scale = min_loss_scale + _amp_state.max_loss_scale = max_loss_scale + +@@ -350,6 +405,16 @@ + _amp_state.opt_properties.master_weights = master_weights + if loss_scale is not None: + _amp_state.opt_properties.loss_scale = loss_scale ++ if combine_grad is not None: ++ _amp_state.opt_properties.combine_grad = combine_grad ++ if combine_ddp is not None: ++ _amp_state.opt_properties.combine_ddp = combine_ddp ++ if ddp_replica_count is not None: ++ _amp_state.opt_properties.ddp_replica_count = ddp_replica_count ++ if user_cast_preferred is not None: ++ _amp_state.opt_properties.user_cast_preferred = user_cast_preferred ++ if check_combined_tensors is not None: ++ _amp_state.opt_properties.check_combined_tensors = check_combined_tensors + + maybe_print("After processing overrides, optimization options are:", True) + for k, v in _amp_state.opt_properties.options.items(): +diff -Nur '--exclude=.git' apex/apex/amp/handle.py apex-develop/apex/amp/handle.py +--- apex/apex/amp/handle.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/handle.py 2022-08-11 12:30:58.659866271 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import contextlib + import warnings + import sys +@@ -110,6 +126,9 @@ + if not optimizer._amp_stash.params_have_scaled_gradients: + optimizer._prepare_amp_backward() + ++ if loss_scaler.dynamic: ++ LossScaler.clear_npu_overflow_flag() ++ + yield (loss.float())*loss_scale + + if delay_unscale: +@@ -142,8 +161,12 @@ + # Maybe skip should delegate to a method owned by the optimizers themselves. 
+ if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"): + # Clear the master grads that wouldn't be zeroed by model.zero_grad() +- for param in opt._amp_stash.all_fp32_from_fp16_params: +- param.grad = None ++ if opt.accelerate or opt.is_npu_fused_optimizer: ++ if opt._amp_stash.main_fp32_from_fp16_grad_combine is not None: ++ opt._amp_stash.main_fp32_from_fp16_grad_combine.zero_() ++ else: ++ for param in opt._amp_stash.all_fp32_from_fp16_params: ++ param.grad = None + if hasattr(opt, "most_recent_scale"): + opt.most_recent_scale = 1.0 + opt.scale_set_by_backward = False diff -Nur '--exclude=.git' apex/apex/amp/scaler.py apex-develop/apex/amp/scaler.py ---- apex/apex/amp/scaler.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/scaler.py 2022-05-13 10:49:00.200000000 +0800 +--- apex/apex/amp/scaler.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/scaler.py 2022-08-11 12:30:58.663866271 +0000 @@ -1,7 +1,25 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -2100,8 +2135,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/scaler.py apex-develop/apex/amp/scaler. return should_skip diff -Nur '--exclude=.git' apex/apex/amp/utils.py apex-develop/apex/amp/utils.py ---- apex/apex/amp/utils.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/utils.py 2022-05-13 10:49:00.200000000 +0800 +--- apex/apex/amp/utils.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/utils.py 2022-08-11 12:30:58.663866271 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -2150,8 +2185,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/utils.py apex-develop/apex/amp/utils.py "cache[x]'s parent. This is likely an error.") # During eval, it's possible to end up caching casted weights with diff -Nur '--exclude=.git' apex/apex/amp/wrap.py apex-develop/apex/amp/wrap.py ---- apex/apex/amp/wrap.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/wrap.py 2022-05-13 10:49:00.200000000 +0800 +--- apex/apex/amp/wrap.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/wrap.py 2022-08-11 12:30:58.663866271 +0000 @@ -249,7 +249,7 @@ new_args = [] @@ -2162,8 +2197,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/wrap.py apex-develop/apex/amp/wrap.py fp16_weight_buf = args[0].new_empty((num_params,), dtype=torch.half) diff -Nur '--exclude=.git' apex/apex/optimizers/__init__.py apex-develop/apex/optimizers/__init__.py ---- apex/apex/optimizers/__init__.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/apex/optimizers/__init__.py 2022-05-13 10:49:00.208000000 +0800 +--- apex/apex/optimizers/__init__.py 2022-08-11 12:25:49.467879874 +0000 ++++ apex-develop/apex/optimizers/__init__.py 2022-08-11 12:30:58.671866270 +0000 @@ -2,4 +2,14 @@ from .fused_adam import FusedAdam from .fused_novograd import FusedNovoGrad @@ -2182,8 +2217,8 @@ diff -Nur '--exclude=.git' apex/apex/optimizers/__init__.py apex-develop/apex/op +from .npu_fused_rmsprop import NpuFusedRMSprop +from .npu_fused_rmsprop_tf import NpuFusedRMSpropTF diff -Nur '--exclude=.git' apex/csrc/flatten_unflatten.cpp apex-develop/csrc/flatten_unflatten.cpp ---- apex/csrc/flatten_unflatten.cpp 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/csrc/flatten_unflatten.cpp 2022-05-13 10:49:00.212000000 +0800 +--- apex/csrc/flatten_unflatten.cpp 2022-08-11 12:25:49.475879874 +0000 ++++ apex-develop/csrc/flatten_unflatten.cpp 2022-08-11 12:30:58.679866270 +0000 @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2020, 
Huawei Technologies.All rights reserved. @@ -2212,27 +2247,9 @@ diff -Nur '--exclude=.git' apex/csrc/flatten_unflatten.cpp apex-develop/csrc/fla m.def("flatten", &flatten, "Flatten dense tensors"); m.def("unflatten", &unflatten, "Unflatten dense tensors"); } -diff -Nur '--exclude=.git' apex/.gitignore apex-develop/.gitignore ---- apex/.gitignore 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/.gitignore 1970-01-01 08:00:00.000000000 +0800 -@@ -1,5 +0,0 @@ --apex.egg-info --dist --build --docs/build --*~ -\ No newline at end of file -diff -Nur '--exclude=.git' apex/.gitmodules apex-develop/.gitmodules ---- apex/.gitmodules 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/.gitmodules 1970-01-01 08:00:00.000000000 +0800 -@@ -1,4 +0,0 @@ --[submodule "apex/contrib/csrc/multihead_attn/cutlass"] -- path = apex/contrib/csrc/multihead_attn/cutlass -- url = https://github.com/NVIDIA/cutlass.git -- branch = v1.2.0 diff -Nur '--exclude=.git' apex/setup.py apex-develop/setup.py ---- apex/setup.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/setup.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/setup.py 2022-08-11 12:25:49.475879874 +0000 ++++ apex-develop/setup.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,55 +1,88 @@ -import torch -from torch.utils import cpp_extension @@ -2726,8 +2743,8 @@ diff -Nur '--exclude=.git' apex/setup.py apex-develop/setup.py 'csrc', 'include', diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_add_param_group.py apex-develop/tests/L0/run_amp/test_add_param_group.py ---- apex/tests/L0/run_amp/test_add_param_group.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_amp/test_add_param_group.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_add_param_group.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_add_param_group.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -2816,8 +2833,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_add_param_group.py apex-de opt_level, how_to_zero, zero_before_add)) diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_basic_casts.py apex-develop/tests/L0/run_amp/test_basic_casts.py ---- apex/tests/L0/run_amp/test_basic_casts.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_amp/test_basic_casts.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_basic_casts.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_basic_casts.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3019,8 +3036,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_basic_casts.py apex-develo # TODO: maybe more tests on disabled casting? diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_cache.py apex-develop/tests/L0/run_amp/test_cache.py ---- apex/tests/L0/run_amp/test_cache.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_amp/test_cache.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_cache.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_cache.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. 
@@ -3154,8 +3171,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_cache.py apex-develop/test if __name__ == '__main__': diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_checkpointing.py apex-develop/tests/L0/run_amp/test_checkpointing.py ---- apex/tests/L0/run_amp/test_checkpointing.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_checkpointing.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_checkpointing.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_checkpointing.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3197,8 +3214,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_checkpointing.py apex-deve def train_step(self, model, optimizer, data, loss_ids): diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_larc.py apex-develop/tests/L0/run_amp/test_larc.py ---- apex/tests/L0/run_amp/test_larc.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_larc.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_larc.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_larc.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,5 +1,5 @@ import unittest - @@ -3241,8 +3258,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_larc.py apex-develop/tests optimizer.zero_grad() diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_promotion.py apex-develop/tests/L0/run_amp/test_promotion.py ---- apex/tests/L0/run_amp/test_promotion.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_promotion.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_promotion.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_promotion.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3339,8 +3356,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_promotion.py apex-develop/ xs.exp_() diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_rnn.py apex-develop/tests/L0/run_amp/test_rnn.py ---- apex/tests/L0/run_amp/test_rnn.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_rnn.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_rnn.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_rnn.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3399,8 +3416,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_rnn.py apex-develop/tests/ self.assertEqual(x.grad.dtype, x.dtype) diff -Nur '--exclude=.git' apex/tests/L0/run_amp/utils.py apex-develop/tests/L0/run_amp/utils.py ---- apex/tests/L0/run_amp/utils.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/utils.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/utils.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/utils.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,7 +1,28 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. 
@@ -3464,8 +3481,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/utils.py apex-develop/tests/L0/ + return npu_input1 \ No newline at end of file diff -Nur '--exclude=.git' apex/tests/L0/run_test.py apex-develop/tests/L0/run_test.py ---- apex/tests/L0/run_test.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_test.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_test.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_test.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,20 +1,72 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3549,8 +3566,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_test.py apex-develop/tests/L0/run_t -sys.exit(errcode) + sys.exit(errcode) diff -Nur '--exclude=.git' apex/tests/L1/common/main_amp.py apex-develop/tests/L1/common/main_amp.py ---- apex/tests/L1/common/main_amp.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L1/common/main_amp.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L1/common/main_amp.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L1/common/main_amp.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3832,8 +3849,8 @@ diff -Nur '--exclude=.git' apex/tests/L1/common/main_amp.py apex-develop/tests/L .format(top1=top1, top5=top5)) diff -Nur '--exclude=.git' apex/tests/L1/cross_product/run.sh apex-develop/tests/L1/cross_product/run.sh ---- apex/tests/L1/cross_product/run.sh 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L1/cross_product/run.sh 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L1/cross_product/run.sh 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L1/cross_product/run.sh 2022-08-11 12:30:58.683866270 +0000 @@ -3,4 +3,5 @@ # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/" # DATADIR="/opt/home/apex/examples/imagenet/" -- Gitee