From 6dc30ee5b4a4b68a3bd79e92d5e7e62bf74481be Mon Sep 17 00:00:00 2001 From: xiaxia3 Date: Thu, 11 Aug 2022 13:11:14 +0000 Subject: [PATCH] modify container_abcs to support pytorch v1.11.0 --- patch/npu.patch | 563 +++++++++++++++++++++++++----------------------- 1 file changed, 290 insertions(+), 273 deletions(-) diff --git a/patch/npu.patch b/patch/npu.patch index b20b3d4..653d07f 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -1,225 +1,41 @@ -diff -Nur '--exclude=.git' apex/apex/amp/amp.py apex-develop/apex/amp/amp.py ---- apex/apex/amp/amp.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/amp.py 2022-05-13 10:49:00.196000000 +0800 -@@ -65,7 +65,14 @@ - - - # Top-level function to insert _all_ the hooks. --def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False): -+def init( -+ enabled=True, -+ loss_scale="dynamic", -+ enable_caching=True, -+ verbose=False, -+ allow_banned=False, -+ user_cast_preferred=None): -+ - global _DECORATOR_HANDLE - - if not enabled: -@@ -76,7 +83,10 @@ - handle = AmpHandle(loss_scale, enable_caching, verbose) - - # 0) Force-{fp16, fp32} for user-annotated functions -+ _user_cast_registry = set() - for mod, fn, cast_fn in _USER_CAST_REGISTRY: -+ if user_cast_preferred: -+ _user_cast_registry.add((mod, fn)) - try_caching = (cast_fn == utils.maybe_half) - wrap.cached_cast(mod, fn, cast_fn, handle, - try_caching, verbose) -@@ -96,6 +106,8 @@ - for module, (list_name, cast_fn) in itertools.product(override_modules, - cast_table): - for fn in getattr(module, list_name): -+ if user_cast_preferred and (module.MODULE, fn) in _user_cast_registry: -+ continue - try_caching = (cast_fn == utils.maybe_half) - wrap.cached_cast(module.MODULE, fn, cast_fn, handle, - try_caching, verbose) -diff -Nur '--exclude=.git' apex/apex/amp/frontend.py apex-develop/apex/amp/frontend.py ---- apex/apex/amp/frontend.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/frontend.py 2022-05-13 10:49:00.196000000 +0800 -@@ -19,6 +19,11 @@ - "keep_batchnorm_fp32" : None, - "master_weights" : None, - "loss_scale" : 1.0, -+ "combine_grad": None, -+ "combine_ddp": None, -+ "ddp_replica_count": 4, -+ "check_combined_tensors": None, -+ "user_cast_preferred":None, - # Reserved for future functionality - # "fused_optimizer" : False, - # "enable_ddp_interop" : False, -@@ -91,6 +96,20 @@ - self.options[name] = value - else: - self.options[name] = float(value) -+ elif name == "combine_grad" or name == "check_combined_tensors": -+ if self.opt_level not in ["O1", "O2"] and value: -+ warn_or_err("Currently, combine_grad=True or check_combined_tensors=True should only be set " -+ "by selecting opt_level='O1' or opt_level='O2'.") -+ self.options[name] = value -+ elif name == "combine_ddp": -+ if not self.combine_grad: -+ warn_or_err("Combine_grad should be True when combine_ddp using.. 
\n") -+ self.options[name] = value -+ elif name == "user_cast_preferred": -+ if self.opt_level != "O1" and value: -+ warn_or_err("Currently, user_cast_preferred=True should only be set by " -+ "selecting opt_level='O1'.") -+ self.options[name] = value - else: - self.options[name] = value - else: -@@ -161,6 +180,7 @@ - properties.keep_batchnorm_fp32 = None - properties.master_weights = None - properties.loss_scale = "dynamic" -+ properties.combine_grad = None - # properties.fused_optimizer = False - # properties.enable_ddp_interop = False - return properties # modified in place so this isn't really necessary -@@ -205,8 +225,17 @@ - cast_model_outputs=None, - num_losses=1, - verbosity=1, -+ dynamic_init_scale=2.**16, -+ scale_growth_factor=2., -+ scale_backoff_factor=0.5, -+ scale_window=2000, - min_loss_scale=None, -- max_loss_scale=2.**24 -+ max_loss_scale=2.**24, -+ combine_grad=None, -+ combine_ddp=None, -+ ddp_replica_count=4, -+ user_cast_preferred=None, -+ check_combined_tensors=None - ): - """ - Initialize your models, optimizers, and the Torch tensor and functional namespace according to the -@@ -254,11 +283,32 @@ - support multiple losses/backward passes, but use a single global loss scale - for all of them. - verbosity (int, default=1): Set to 0 to suppress Amp-related output. -+ dynamic_init_scale (float, optional, default=2.**16): Initial dynamic loss scale factor. -+ scale_growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied -+ if no overflow occurs for ``scale_window`` consecutive iterations. -+ If dynamic loss scaling is not used, `scale_growth_factor` is ignored. -+ scale_backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied -+ if overflow occurs in an iteration. If dynamic loss scaling is not used, `scale_backoff_factor` is ignored. -+ scale_window (int, optional, default=2000): Number of consecutive iterations without overflow -+ that must occur for the scale to be multiplied by ``scale_growth_factor``. -+ If dynamic loss scaling is not used, `scale_window` is ignored. - min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic - loss scaling. The default value of None means that no floor is imposed. - If dynamic loss scaling is not used, `min_loss_scale` is ignored. - max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by - dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored. -+ combine_grad (bool, optional, default=None): If True, make gradients fused for unscale. -+ combine_ddp (bool, optional, default=None): If True, use combined gradients for data exchange, -+ accelerate multi-card training, and functionally replace DistributedDataParallel. -+ ddp_replica_count (bool, optional, default=4): Set the number of replicas of combined gradients. -+ Theoretically, the more replicas, the higher the degree of parallelism, but the time-consuming -+ distribution operation itself will lead to a decrease in performance even though the degree -+ of parallelism is improved. Therefore, we limit and optimize the replica size for data exchange. -+ The final number of replicas is not necessarily exactly the same as the set number -+ user_cast_preferred (bool, optional, default=None): If True in O1, user cast registry is preferred -+ rather than fp16 white- / black-list, to avoid redundant dtype cast. 
-+ check_combined_tensors (bool, optional, default=None): If True, check if the combined grads and combined params -+ are valid during training - - Returns: - Model(s) and optimizer(s) modified according to the ``opt_level``. -@@ -306,6 +356,7 @@ - https://github.com/NVIDIA/apex/issues - """ - _amp_state.opt_properties = Properties() -+ # Here add a switch to open combine tensor - _amp_state.verbosity = verbosity - - if not enabled: -@@ -330,6 +381,10 @@ - for k, v in _amp_state.opt_properties.options.items(): - maybe_print("{:22} : {}".format(k, v), True) - -+ _amp_state.dynamic_init_scale = dynamic_init_scale -+ _amp_state.scale_growth_factor = scale_growth_factor -+ _amp_state.scale_backoff_factor = scale_backoff_factor -+ _amp_state.scale_window = scale_window - _amp_state.min_loss_scale = min_loss_scale - _amp_state.max_loss_scale = max_loss_scale - -@@ -350,6 +405,16 @@ - _amp_state.opt_properties.master_weights = master_weights - if loss_scale is not None: - _amp_state.opt_properties.loss_scale = loss_scale -+ if combine_grad is not None: -+ _amp_state.opt_properties.combine_grad = combine_grad -+ if combine_ddp is not None: -+ _amp_state.opt_properties.combine_ddp = combine_ddp -+ if ddp_replica_count is not None: -+ _amp_state.opt_properties.ddp_replica_count = ddp_replica_count -+ if user_cast_preferred is not None: -+ _amp_state.opt_properties.user_cast_preferred = user_cast_preferred -+ if check_combined_tensors is not None: -+ _amp_state.opt_properties.check_combined_tensors = check_combined_tensors - - maybe_print("After processing overrides, optimization options are:", True) - for k, v in _amp_state.opt_properties.options.items(): -diff -Nur '--exclude=.git' apex/apex/amp/handle.py apex-develop/apex/amp/handle.py ---- apex/apex/amp/handle.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/handle.py 2022-05-13 10:49:00.196000000 +0800 -@@ -1,3 +1,19 @@ -+# Copyright (c) 2020, Huawei Technologies. -+# Copyright (c) 2019, NVIDIA CORPORATION. -+# All rights reserved. -+# -+# Licensed under the BSD 3-Clause License (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# https://opensource.org/licenses/BSD-3-Clause -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. 
-+ - import contextlib - import warnings - import sys -@@ -110,6 +126,9 @@ - if not optimizer._amp_stash.params_have_scaled_gradients: - optimizer._prepare_amp_backward() +diff -Nur '--exclude=.git' apex/.gitignore apex-develop/.gitignore +--- apex/.gitignore 2022-08-11 12:25:49.507879872 +0000 ++++ apex-develop/.gitignore 1970-01-01 00:00:00.000000000 +0000 +@@ -1,5 +0,0 @@ +-apex.egg-info +-dist +-build +-docs/build +-*~ +\ No newline at end of file +diff -Nur '--exclude=.git' apex/.gitmodules apex-develop/.gitmodules +--- apex/.gitmodules 2022-08-11 12:25:49.507879872 +0000 ++++ apex-develop/.gitmodules 1970-01-01 00:00:00.000000000 +0000 +@@ -1,4 +0,0 @@ +-[submodule "apex/contrib/csrc/multihead_attn/cutlass"] +- path = apex/contrib/csrc/multihead_attn/cutlass +- url = https://github.com/NVIDIA/cutlass.git +- branch = v1.2.0 +diff -Nur '--exclude=.git' apex/apex/amp/_amp_state.py apex-develop/apex/amp/_amp_state.py +--- apex/apex/amp/_amp_state.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/_amp_state.py 2022-08-11 12:30:58.659866271 +0000 +@@ -8,10 +8,10 @@ + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + +-if TORCH_MAJOR == 0: +- import collections.abc as container_abcs +-else: ++if TORCH_MAJOR == 1 and TORCH_MINOR < 8: + from torch._six import container_abcs ++else: ++ import collections.abc as container_abcs -+ if loss_scaler.dynamic: -+ LossScaler.clear_npu_overflow_flag() -+ - yield (loss.float())*loss_scale - if delay_unscale: -@@ -142,8 +161,12 @@ - # Maybe skip should delegate to a method owned by the optimizers themselves. - if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"): - # Clear the master grads that wouldn't be zeroed by model.zero_grad() -- for param in opt._amp_stash.all_fp32_from_fp16_params: -- param.grad = None -+ if opt.accelerate or opt.is_npu_fused_optimizer: -+ if opt._amp_stash.main_fp32_from_fp16_grad_combine is not None: -+ opt._amp_stash.main_fp32_from_fp16_grad_combine.zero_() -+ else: -+ for param in opt._amp_stash.all_fp32_from_fp16_params: -+ param.grad = None - if hasattr(opt, "most_recent_scale"): - opt.most_recent_scale = 1.0 - opt.scale_set_by_backward = False + class AmpState(object): diff -Nur '--exclude=.git' apex/apex/amp/_initialize.py apex-develop/apex/amp/_initialize.py ---- apex/apex/amp/_initialize.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/_initialize.py 2022-05-13 10:49:00.196000000 +0800 +--- apex/apex/amp/_initialize.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/_initialize.py 2022-08-11 12:30:58.659866271 +0000 @@ -1,9 +1,26 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -337,8 +153,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/_initialize.py apex-develop/apex/amp/_i if models_was_list: return models, optimizers diff -Nur '--exclude=.git' apex/apex/amp/_process_optimizer.py apex-develop/apex/amp/_process_optimizer.py ---- apex/apex/amp/_process_optimizer.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/_process_optimizer.py 2022-05-13 10:49:00.196000000 +0800 +--- apex/apex/amp/_process_optimizer.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/_process_optimizer.py 2022-08-11 12:30:58.659866271 +0000 @@ -1,9 +1,77 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. 
@@ -1795,9 +1611,228 @@ diff -Nur '--exclude=.git' apex/apex/amp/_process_optimizer.py apex-develop/apex stash.all_fp32_params.append(param) stash.all_fp32_grad_stash.append(None) else: +diff -Nur '--exclude=.git' apex/apex/amp/amp.py apex-develop/apex/amp/amp.py +--- apex/apex/amp/amp.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/amp.py 2022-08-11 12:30:58.659866271 +0000 +@@ -65,7 +65,14 @@ + + + # Top-level function to insert _all_ the hooks. +-def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False): ++def init( ++ enabled=True, ++ loss_scale="dynamic", ++ enable_caching=True, ++ verbose=False, ++ allow_banned=False, ++ user_cast_preferred=None): ++ + global _DECORATOR_HANDLE + + if not enabled: +@@ -76,7 +83,10 @@ + handle = AmpHandle(loss_scale, enable_caching, verbose) + + # 0) Force-{fp16, fp32} for user-annotated functions ++ _user_cast_registry = set() + for mod, fn, cast_fn in _USER_CAST_REGISTRY: ++ if user_cast_preferred: ++ _user_cast_registry.add((mod, fn)) + try_caching = (cast_fn == utils.maybe_half) + wrap.cached_cast(mod, fn, cast_fn, handle, + try_caching, verbose) +@@ -96,6 +106,8 @@ + for module, (list_name, cast_fn) in itertools.product(override_modules, + cast_table): + for fn in getattr(module, list_name): ++ if user_cast_preferred and (module.MODULE, fn) in _user_cast_registry: ++ continue + try_caching = (cast_fn == utils.maybe_half) + wrap.cached_cast(module.MODULE, fn, cast_fn, handle, + try_caching, verbose) +diff -Nur '--exclude=.git' apex/apex/amp/frontend.py apex-develop/apex/amp/frontend.py +--- apex/apex/amp/frontend.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/frontend.py 2022-08-11 12:30:58.659866271 +0000 +@@ -19,6 +19,11 @@ + "keep_batchnorm_fp32" : None, + "master_weights" : None, + "loss_scale" : 1.0, ++ "combine_grad": None, ++ "combine_ddp": None, ++ "ddp_replica_count": 4, ++ "check_combined_tensors": None, ++ "user_cast_preferred":None, + # Reserved for future functionality + # "fused_optimizer" : False, + # "enable_ddp_interop" : False, +@@ -91,6 +96,20 @@ + self.options[name] = value + else: + self.options[name] = float(value) ++ elif name == "combine_grad" or name == "check_combined_tensors": ++ if self.opt_level not in ["O1", "O2"] and value: ++ warn_or_err("Currently, combine_grad=True or check_combined_tensors=True should only be set " ++ "by selecting opt_level='O1' or opt_level='O2'.") ++ self.options[name] = value ++ elif name == "combine_ddp": ++ if not self.combine_grad: ++ warn_or_err("Combine_grad should be True when combine_ddp using.. 
\n") ++ self.options[name] = value ++ elif name == "user_cast_preferred": ++ if self.opt_level != "O1" and value: ++ warn_or_err("Currently, user_cast_preferred=True should only be set by " ++ "selecting opt_level='O1'.") ++ self.options[name] = value + else: + self.options[name] = value + else: +@@ -161,6 +180,7 @@ + properties.keep_batchnorm_fp32 = None + properties.master_weights = None + properties.loss_scale = "dynamic" ++ properties.combine_grad = None + # properties.fused_optimizer = False + # properties.enable_ddp_interop = False + return properties # modified in place so this isn't really necessary +@@ -205,8 +225,17 @@ + cast_model_outputs=None, + num_losses=1, + verbosity=1, ++ dynamic_init_scale=2.**16, ++ scale_growth_factor=2., ++ scale_backoff_factor=0.5, ++ scale_window=2000, + min_loss_scale=None, +- max_loss_scale=2.**24 ++ max_loss_scale=2.**24, ++ combine_grad=None, ++ combine_ddp=None, ++ ddp_replica_count=4, ++ user_cast_preferred=None, ++ check_combined_tensors=None + ): + """ + Initialize your models, optimizers, and the Torch tensor and functional namespace according to the +@@ -254,11 +283,32 @@ + support multiple losses/backward passes, but use a single global loss scale + for all of them. + verbosity (int, default=1): Set to 0 to suppress Amp-related output. ++ dynamic_init_scale (float, optional, default=2.**16): Initial dynamic loss scale factor. ++ scale_growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied ++ if no overflow occurs for ``scale_window`` consecutive iterations. ++ If dynamic loss scaling is not used, `scale_growth_factor` is ignored. ++ scale_backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied ++ if overflow occurs in an iteration. If dynamic loss scaling is not used, `scale_backoff_factor` is ignored. ++ scale_window (int, optional, default=2000): Number of consecutive iterations without overflow ++ that must occur for the scale to be multiplied by ``scale_growth_factor``. ++ If dynamic loss scaling is not used, `scale_window` is ignored. + min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic + loss scaling. The default value of None means that no floor is imposed. + If dynamic loss scaling is not used, `min_loss_scale` is ignored. + max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by + dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored. ++ combine_grad (bool, optional, default=None): If True, make gradients fused for unscale. ++ combine_ddp (bool, optional, default=None): If True, use combined gradients for data exchange, ++ accelerate multi-card training, and functionally replace DistributedDataParallel. ++ ddp_replica_count (bool, optional, default=4): Set the number of replicas of combined gradients. ++ Theoretically, the more replicas, the higher the degree of parallelism, but the time-consuming ++ distribution operation itself will lead to a decrease in performance even though the degree ++ of parallelism is improved. Therefore, we limit and optimize the replica size for data exchange. ++ The final number of replicas is not necessarily exactly the same as the set number ++ user_cast_preferred (bool, optional, default=None): If True in O1, user cast registry is preferred ++ rather than fp16 white- / black-list, to avoid redundant dtype cast. 
++ check_combined_tensors (bool, optional, default=None): If True, check if the combined grads and combined params ++ are valid during training + + Returns: + Model(s) and optimizer(s) modified according to the ``opt_level``. +@@ -306,6 +356,7 @@ + https://github.com/NVIDIA/apex/issues + """ + _amp_state.opt_properties = Properties() ++ # Here add a switch to open combine tensor + _amp_state.verbosity = verbosity + + if not enabled: +@@ -330,6 +381,10 @@ + for k, v in _amp_state.opt_properties.options.items(): + maybe_print("{:22} : {}".format(k, v), True) + ++ _amp_state.dynamic_init_scale = dynamic_init_scale ++ _amp_state.scale_growth_factor = scale_growth_factor ++ _amp_state.scale_backoff_factor = scale_backoff_factor ++ _amp_state.scale_window = scale_window + _amp_state.min_loss_scale = min_loss_scale + _amp_state.max_loss_scale = max_loss_scale + +@@ -350,6 +405,16 @@ + _amp_state.opt_properties.master_weights = master_weights + if loss_scale is not None: + _amp_state.opt_properties.loss_scale = loss_scale ++ if combine_grad is not None: ++ _amp_state.opt_properties.combine_grad = combine_grad ++ if combine_ddp is not None: ++ _amp_state.opt_properties.combine_ddp = combine_ddp ++ if ddp_replica_count is not None: ++ _amp_state.opt_properties.ddp_replica_count = ddp_replica_count ++ if user_cast_preferred is not None: ++ _amp_state.opt_properties.user_cast_preferred = user_cast_preferred ++ if check_combined_tensors is not None: ++ _amp_state.opt_properties.check_combined_tensors = check_combined_tensors + + maybe_print("After processing overrides, optimization options are:", True) + for k, v in _amp_state.opt_properties.options.items(): +diff -Nur '--exclude=.git' apex/apex/amp/handle.py apex-develop/apex/amp/handle.py +--- apex/apex/amp/handle.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/handle.py 2022-08-11 12:30:58.659866271 +0000 +@@ -1,3 +1,19 @@ ++# Copyright (c) 2020, Huawei Technologies. ++# Copyright (c) 2019, NVIDIA CORPORATION. ++# All rights reserved. ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://opensource.org/licenses/BSD-3-Clause ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + import contextlib + import warnings + import sys +@@ -110,6 +126,9 @@ + if not optimizer._amp_stash.params_have_scaled_gradients: + optimizer._prepare_amp_backward() + ++ if loss_scaler.dynamic: ++ LossScaler.clear_npu_overflow_flag() ++ + yield (loss.float())*loss_scale + + if delay_unscale: +@@ -142,8 +161,12 @@ + # Maybe skip should delegate to a method owned by the optimizers themselves. 
+ if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"): + # Clear the master grads that wouldn't be zeroed by model.zero_grad() +- for param in opt._amp_stash.all_fp32_from_fp16_params: +- param.grad = None ++ if opt.accelerate or opt.is_npu_fused_optimizer: ++ if opt._amp_stash.main_fp32_from_fp16_grad_combine is not None: ++ opt._amp_stash.main_fp32_from_fp16_grad_combine.zero_() ++ else: ++ for param in opt._amp_stash.all_fp32_from_fp16_params: ++ param.grad = None + if hasattr(opt, "most_recent_scale"): + opt.most_recent_scale = 1.0 + opt.scale_set_by_backward = False diff -Nur '--exclude=.git' apex/apex/amp/scaler.py apex-develop/apex/amp/scaler.py ---- apex/apex/amp/scaler.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/scaler.py 2022-05-13 10:49:00.200000000 +0800 +--- apex/apex/amp/scaler.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/scaler.py 2022-08-11 12:30:58.663866271 +0000 @@ -1,7 +1,25 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -2100,8 +2135,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/scaler.py apex-develop/apex/amp/scaler. return should_skip diff -Nur '--exclude=.git' apex/apex/amp/utils.py apex-develop/apex/amp/utils.py ---- apex/apex/amp/utils.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/utils.py 2022-05-13 10:49:00.200000000 +0800 +--- apex/apex/amp/utils.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/utils.py 2022-08-11 12:30:58.663866271 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -2150,8 +2185,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/utils.py apex-develop/apex/amp/utils.py "cache[x]'s parent. This is likely an error.") # During eval, it's possible to end up caching casted weights with diff -Nur '--exclude=.git' apex/apex/amp/wrap.py apex-develop/apex/amp/wrap.py ---- apex/apex/amp/wrap.py 2022-03-09 17:27:27.398309639 +0800 -+++ apex-develop/apex/amp/wrap.py 2022-05-13 10:49:00.200000000 +0800 +--- apex/apex/amp/wrap.py 2022-08-11 12:25:49.463879874 +0000 ++++ apex-develop/apex/amp/wrap.py 2022-08-11 12:30:58.663866271 +0000 @@ -249,7 +249,7 @@ new_args = [] @@ -2162,8 +2197,8 @@ diff -Nur '--exclude=.git' apex/apex/amp/wrap.py apex-develop/apex/amp/wrap.py fp16_weight_buf = args[0].new_empty((num_params,), dtype=torch.half) diff -Nur '--exclude=.git' apex/apex/optimizers/__init__.py apex-develop/apex/optimizers/__init__.py ---- apex/apex/optimizers/__init__.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/apex/optimizers/__init__.py 2022-05-13 10:49:00.208000000 +0800 +--- apex/apex/optimizers/__init__.py 2022-08-11 12:25:49.467879874 +0000 ++++ apex-develop/apex/optimizers/__init__.py 2022-08-11 12:30:58.671866270 +0000 @@ -2,4 +2,14 @@ from .fused_adam import FusedAdam from .fused_novograd import FusedNovoGrad @@ -2182,8 +2217,8 @@ diff -Nur '--exclude=.git' apex/apex/optimizers/__init__.py apex-develop/apex/op +from .npu_fused_rmsprop import NpuFusedRMSprop +from .npu_fused_rmsprop_tf import NpuFusedRMSpropTF diff -Nur '--exclude=.git' apex/csrc/flatten_unflatten.cpp apex-develop/csrc/flatten_unflatten.cpp ---- apex/csrc/flatten_unflatten.cpp 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/csrc/flatten_unflatten.cpp 2022-05-13 10:49:00.212000000 +0800 +--- apex/csrc/flatten_unflatten.cpp 2022-08-11 12:25:49.475879874 +0000 ++++ apex-develop/csrc/flatten_unflatten.cpp 2022-08-11 12:30:58.679866270 +0000 @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2020, 
Huawei Technologies.All rights reserved. @@ -2212,27 +2247,9 @@ diff -Nur '--exclude=.git' apex/csrc/flatten_unflatten.cpp apex-develop/csrc/fla m.def("flatten", &flatten, "Flatten dense tensors"); m.def("unflatten", &unflatten, "Unflatten dense tensors"); } -diff -Nur '--exclude=.git' apex/.gitignore apex-develop/.gitignore ---- apex/.gitignore 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/.gitignore 1970-01-01 08:00:00.000000000 +0800 -@@ -1,5 +0,0 @@ --apex.egg-info --dist --build --docs/build --*~ -\ No newline at end of file -diff -Nur '--exclude=.git' apex/.gitmodules apex-develop/.gitmodules ---- apex/.gitmodules 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/.gitmodules 1970-01-01 08:00:00.000000000 +0800 -@@ -1,4 +0,0 @@ --[submodule "apex/contrib/csrc/multihead_attn/cutlass"] -- path = apex/contrib/csrc/multihead_attn/cutlass -- url = https://github.com/NVIDIA/cutlass.git -- branch = v1.2.0 diff -Nur '--exclude=.git' apex/setup.py apex-develop/setup.py ---- apex/setup.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/setup.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/setup.py 2022-08-11 12:25:49.475879874 +0000 ++++ apex-develop/setup.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,55 +1,88 @@ -import torch -from torch.utils import cpp_extension @@ -2726,8 +2743,8 @@ diff -Nur '--exclude=.git' apex/setup.py apex-develop/setup.py 'csrc', 'include', diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_add_param_group.py apex-develop/tests/L0/run_amp/test_add_param_group.py ---- apex/tests/L0/run_amp/test_add_param_group.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_amp/test_add_param_group.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_add_param_group.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_add_param_group.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -2816,8 +2833,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_add_param_group.py apex-de opt_level, how_to_zero, zero_before_add)) diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_basic_casts.py apex-develop/tests/L0/run_amp/test_basic_casts.py ---- apex/tests/L0/run_amp/test_basic_casts.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_amp/test_basic_casts.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_basic_casts.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_basic_casts.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3019,8 +3036,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_basic_casts.py apex-develo # TODO: maybe more tests on disabled casting? diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_cache.py apex-develop/tests/L0/run_amp/test_cache.py ---- apex/tests/L0/run_amp/test_cache.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_amp/test_cache.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_cache.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_cache.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. 
@@ -3154,8 +3171,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_cache.py apex-develop/test if __name__ == '__main__': diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_checkpointing.py apex-develop/tests/L0/run_amp/test_checkpointing.py ---- apex/tests/L0/run_amp/test_checkpointing.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_checkpointing.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_checkpointing.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_checkpointing.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3197,8 +3214,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_checkpointing.py apex-deve def train_step(self, model, optimizer, data, loss_ids): diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_larc.py apex-develop/tests/L0/run_amp/test_larc.py ---- apex/tests/L0/run_amp/test_larc.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_larc.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_larc.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_larc.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,5 +1,5 @@ import unittest - @@ -3241,8 +3258,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_larc.py apex-develop/tests optimizer.zero_grad() diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_promotion.py apex-develop/tests/L0/run_amp/test_promotion.py ---- apex/tests/L0/run_amp/test_promotion.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_promotion.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_promotion.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_promotion.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3339,8 +3356,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_promotion.py apex-develop/ xs.exp_() diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_rnn.py apex-develop/tests/L0/run_amp/test_rnn.py ---- apex/tests/L0/run_amp/test_rnn.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/test_rnn.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/test_rnn.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/test_rnn.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3399,8 +3416,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/test_rnn.py apex-develop/tests/ self.assertEqual(x.grad.dtype, x.dtype) diff -Nur '--exclude=.git' apex/tests/L0/run_amp/utils.py apex-develop/tests/L0/run_amp/utils.py ---- apex/tests/L0/run_amp/utils.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L0/run_amp/utils.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_amp/utils.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_amp/utils.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,7 +1,28 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. 
@@ -3464,8 +3481,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_amp/utils.py apex-develop/tests/L0/ + return npu_input1 \ No newline at end of file diff -Nur '--exclude=.git' apex/tests/L0/run_test.py apex-develop/tests/L0/run_test.py ---- apex/tests/L0/run_test.py 2022-03-09 17:27:27.402309608 +0800 -+++ apex-develop/tests/L0/run_test.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L0/run_test.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L0/run_test.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,20 +1,72 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3549,8 +3566,8 @@ diff -Nur '--exclude=.git' apex/tests/L0/run_test.py apex-develop/tests/L0/run_t -sys.exit(errcode) + sys.exit(errcode) diff -Nur '--exclude=.git' apex/tests/L1/common/main_amp.py apex-develop/tests/L1/common/main_amp.py ---- apex/tests/L1/common/main_amp.py 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L1/common/main_amp.py 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L1/common/main_amp.py 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L1/common/main_amp.py 2022-08-11 12:30:58.683866270 +0000 @@ -1,3 +1,19 @@ +# Copyright (c) 2020, Huawei Technologies. +# Copyright (c) 2019, NVIDIA CORPORATION. @@ -3832,8 +3849,8 @@ diff -Nur '--exclude=.git' apex/tests/L1/common/main_amp.py apex-develop/tests/L .format(top1=top1, top5=top5)) diff -Nur '--exclude=.git' apex/tests/L1/cross_product/run.sh apex-develop/tests/L1/cross_product/run.sh ---- apex/tests/L1/cross_product/run.sh 2022-03-09 17:27:27.406309578 +0800 -+++ apex-develop/tests/L1/cross_product/run.sh 2022-05-13 10:49:00.216000000 +0800 +--- apex/tests/L1/cross_product/run.sh 2022-08-11 12:25:49.471879874 +0000 ++++ apex-develop/tests/L1/cross_product/run.sh 2022-08-11 12:30:58.683866270 +0000 @@ -3,4 +3,5 @@ # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/" # DATADIR="/opt/home/apex/examples/imagenet/" -- Gitee