From 187d98c6a9e44fdd0889659d24d71cb3b588a4e0 Mon Sep 17 00:00:00 2001
From: Zhao Hang <wb-zh951434@alibaba-inc.com>
Date: Thu, 22 Dec 2022 17:51:13 +0800
Subject: [PATCH 1/6] update to gcc-toolset-12-gcc-12.1.1-3.2.el8

Signed-off-by: Zhao Hang <wb-zh951434@alibaba-inc.com>
---
 ...all-to-enable-AMX-for-latest-kernels.patch |  77 ---
 ...VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch |  83 ---
 ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 -----
 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ------------------
 ...-small-loop-unrolling-in-backend-PR-.patch | 231 ---------
 download                                      |   2 +-
 gcc.spec                                      |  96 ++--
 gcc12-libtsan-s390x.patch                     |  17 +
 gcc12-pr105551.patch                          |  28 +
 gcc12-pr105991.patch                          |  89 ++++
 10 files changed, 196 insertions(+), 1031 deletions(-)
 delete mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 delete mode 100644 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
 delete mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 delete mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch
 delete mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
 create mode 100644 gcc12-libtsan-s390x.patch
 create mode 100644 gcc12-pr105551.patch
 create mode 100644 gcc12-pr105991.patch

diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
deleted file mode 100644
index 94625b5..0000000
--- a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
+++ /dev/null
@@ -1,77 +0,0 @@
-From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001
-From: Haochen Jiang <haochen.jiang@intel.com>
-Date: Thu, 16 Jun 2022 00:15:53 -0700
-Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels
-
-gcc/testsuite/ChangeLog:
-
-	* gcc.target/i386/amx-check.h (request_perm_xtile_data):
-	New function to check if AMX is usable and enable AMX.
-	(main): Run test if AMX is usable.
----
- gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++
- 1 file changed, 30 insertions(+)
-
-diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
-index 434b0e59703..6fff5ff4631 100644
---- a/gcc/testsuite/gcc.target/i386/amx-check.h
-+++ b/gcc/testsuite/gcc.target/i386/amx-check.h
-@@ -4,11 +4,24 @@
- #include <stdlib.h>
- #include <string.h>
- #include <stdint.h>
-+#include <unistd.h>
-+#ifdef __linux__
-+#include <sys/syscall.h>
-+#endif
- #ifdef DEBUG
- #include <stdio.h>
- #endif
- #include "cpuid.h"
- 
-+#define XFEATURE_XTILECFG	17
-+#define XFEATURE_XTILEDATA	18
-+#define XFEATURE_MASK_XTILECFG	(1 << XFEATURE_XTILECFG)
-+#define XFEATURE_MASK_XTILEDATA	(1 << XFEATURE_XTILEDATA)
-+#define XFEATURE_MASK_XTILE	(XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
-+
-+#define ARCH_GET_XCOMP_PERM	0x1022
-+#define ARCH_REQ_XCOMP_PERM	0x1023
-+
- /* TODO: The tmm emulation is temporary for current
-    AMX implementation with no tmm regclass, should
-    be changed in the future. */
-@@ -44,6 +57,20 @@ typedef struct __tile
- /* Stride (colum width in byte) used for tileload/store */
- #define _STRIDE 64
- 
-+#ifdef __linux__
-+/* We need syscall to use amx functions */
-+int request_perm_xtile_data()
-+{
-+  unsigned long bitmask;
-+
-+  if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) ||
-+      syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask))
-+    return 0;
-+
-+  return (bitmask & XFEATURE_MASK_XTILE) != 0;
-+}
-+#endif
-+
- /* Initialize tile config by setting all tmm size to 16x64 */
- void init_tile_config (__tilecfg_u *dst)
- {
-@@ -185,6 +212,9 @@ main ()
- #endif
- #ifdef AMX_BF16
-       && __builtin_cpu_supports ("amx-bf16")
-+#endif
-+#ifdef __linux__
-+      && request_perm_xtile_data ()
- #endif
-       )
-     {
--- 
-2.18.2
-
diff --git a/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch b/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
deleted file mode 100644
index 42cabc2..0000000
--- a/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
+++ /dev/null
@@ -1,83 +0,0 @@
-From 11c72f20d4d7ba1862a257cef05dc3a5e84a276d Mon Sep 17 00:00:00 2001
-From: "Cui,Lili" <lili.cui@intel.com>
-Date: Thu, 29 Sep 2022 14:28:06 +0800
-Subject: [PATCH] Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
-
-gcc/ChangeLog:
-
-	* config/i386/driver-i386.cc (host_detect_local_cpu):
-	Move sapphirerapids out of AVX512_VP2INTERSECT.
-	* config/i386/i386.h: Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
-	* doc/invoke.texi: Remove AVX512_VP2INTERSECT from SAPPHIRERAPIDS
----
- gcc/config/i386/driver-i386.cc | 13 +++++--------
- gcc/config/i386/i386.h         |  7 +++----
- gcc/doc/invoke.texi            |  8 ++++----
- 3 files changed, 12 insertions(+), 16 deletions(-)
-
-diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
-index 3c702fdca33..ef567045c67 100644
---- a/gcc/config/i386/driver-i386.cc
-+++ b/gcc/config/i386/driver-i386.cc
-@@ -589,15 +589,12 @@ const char *host_detect_local_cpu (int argc, const char **argv)
- 	      /* This is unknown family 0x6 CPU.  */
- 	      if (has_feature (FEATURE_AVX))
- 		{
-+		  /* Assume Tiger Lake */
- 		  if (has_feature (FEATURE_AVX512VP2INTERSECT))
--		    {
--		      if (has_feature (FEATURE_TSXLDTRK))
--			/* Assume Sapphire Rapids.  */
--			cpu = "sapphirerapids";
--		      else
--			/* Assume Tiger Lake */
--			cpu = "tigerlake";
--		    }
-+		    cpu = "tigerlake";
-+		  /* Assume Sapphire Rapids.  */
-+		  else if (has_feature (FEATURE_TSXLDTRK))
-+		    cpu = "sapphirerapids";
- 		  /* Assume Cooper Lake */
- 		  else if (has_feature (FEATURE_AVX512BF16))
- 		    cpu = "cooperlake";
-diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
-index 900a3bc3673..372a2cff8fe 100644
---- a/gcc/config/i386/i386.h
-+++ b/gcc/config/i386/i386.h
-@@ -2326,10 +2326,9 @@ constexpr wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT
- constexpr wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI
-   | PTA_MOVDIR64B | PTA_CLWB | PTA_AVX512VP2INTERSECT | PTA_KL | PTA_WIDEKL;
- constexpr wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_ICELAKE_SERVER | PTA_MOVDIRI
--  | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_ENQCMD | PTA_CLDEMOTE
--  | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE
--  | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16
--  | PTA_AVX512BF16;
-+  | PTA_MOVDIR64B | PTA_ENQCMD | PTA_CLDEMOTE | PTA_PTWRITE | PTA_WAITPKG
-+  | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE | PTA_AMX_INT8 | PTA_AMX_BF16
-+  | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16 | PTA_AVX512BF16;
- constexpr wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF
-   | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD | PTA_PREFETCHWT1;
- constexpr wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
-diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
-index 271c8bb8468..a9ecc4426a4 100644
---- a/gcc/doc/invoke.texi
-+++ b/gcc/doc/invoke.texi
-@@ -32057,11 +32057,11 @@ Intel sapphirerapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
- SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE,
- RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW,
- AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ,
--AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2
-+AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
- VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
--MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
--SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16
--and AVX512BF16 instruction set support.
-+MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK,
-+UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16 and AVX512BF16
-+instruction set support.
- 
- @item alderlake
- Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3,
--- 
-2.18.2
-
diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
deleted file mode 100644
index 3e70f0c..0000000
--- a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
+++ /dev/null
@@ -1,123 +0,0 @@
-From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001
-From: "Cui,Lili" <lili.cui@intel.com>
-Date: Tue, 1 Nov 2022 09:16:49 +0800
-Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint.
-
-We set up INLINE_HINT_known_hot hint only when we have profile feedback,
-now add function attribute judgement for it, when both caller and callee
-have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint
-for it.
-
-With this patch applied,
-ADL Multi-copy:    538.imagic_r  16.7%
-ICX Multi-copy:    538.imagic_r  15.2%
-CLX Multi-copy:    538.imagic_r  12.7%
-Znver3 Multi-copy: 538.imagic_r  10.6%
-Arm Multi-copy:    538.imagic_r  13.4%
-
-gcc/ChangeLog
-
-	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
-	judgement for INLINE_HINT_known_hot hint.
-
-gcc/testsuite/ChangeLog:
-
-	* gcc.dg/ipa/inlinehint-6.c: New test.
----
- gcc/ipa-inline-analysis.cc              | 13 ++++---
- gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++
- 2 files changed, 56 insertions(+), 4 deletions(-)
- create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
-
-diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc
-index 1ca685d1b0e..7bd29c36590 100644
---- a/gcc/ipa-inline-analysis.cc
-+++ b/gcc/ipa-inline-analysis.cc
-@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
- #include "ipa-utils.h"
- #include "cfgexpand.h"
- #include "gimplify.h"
-+#include "attribs.h"
- 
- /* Cached node/edge growths.  */
- fast_call_summary<edge_growth_cache_entry *, va_heap> *edge_growth_cache = NULL;
-@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time)
-       hints = estimates.hints;
-     }
- 
--  /* When we have profile feedback, we can quite safely identify hot
--     edges and for those we disable size limits.  Don't do that when
--     probability that caller will call the callee is low however, since it
-+  /* When we have profile feedback or function attribute, we can quite safely
-+     identify hot edges and for those we disable size limits.  Don't do that
-+     when probability that caller will call the callee is low however, since it
-      may hurt optimization of the caller's hot path.  */
--  if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
-+  if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
-       && (edge->count.ipa ().apply_scale (2, 1)
- 	  > (edge->caller->inlined_to
- 	     ? edge->caller->inlined_to->count.ipa ()
- 	     : edge->caller->count.ipa ())))
-+      || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl))
-+	  != NULL
-+	 && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl))
-+	  != NULL))
-     hints |= INLINE_HINT_known_hot;
- 
-   gcc_checking_assert (size >= 0);
-diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
-new file mode 100644
-index 00000000000..1f3be641c6d
---- /dev/null
-+++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
-@@ -0,0 +1,47 @@
-+/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp"  } */
-+/* { dg-add-options bind_pic_locally } */
-+
-+#define size_t long long int
-+
-+struct A
-+{
-+  size_t f1, f2, f3, f4;
-+};
-+struct C
-+{
-+  struct A a;
-+  size_t b;
-+};
-+struct C x;
-+
-+__attribute__((hot)) struct C callee (struct A *a, struct C *c)
-+{
-+  c->a=(*a);
-+
-+  if((c->b + 7) & 17)
-+   {
-+      c->a.f1 = c->a.f2 + c->a.f1;
-+      c->a.f2 = c->a.f3 - c->a.f2;
-+      c->a.f3 = c->a.f2 + c->a.f3;
-+      c->a.f4 = c->a.f2 - c->a.f4;
-+      c->b = c->a.f2;
-+
-+    }
-+  return *c;
-+}
-+
-+__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c)
-+{
-+  struct A a;
-+  a.f1 = 1 + d;
-+  a.f2 = e;
-+  a.f3 = 12 + f;
-+  a.f4 = 68 + g;
-+  if (c->b > 0)
-+    return callee (&a, c);
-+  else
-+    return *c;
-+}
-+
-+/* { dg-final { scan-ipa-dump "known_hot"  "inline"  } } */
-+
--- 
-2.18.2
-
diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch
deleted file mode 100644
index b16171b..0000000
--- a/0026-Enable-small-loop-unrolling-for-O2.patch
+++ /dev/null
@@ -1,481 +0,0 @@
-From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001
-From: Hongyu Wang <hongyu.wang@intel.com>
-Date: Thu, 8 Sep 2022 16:52:02 +0800
-Subject: [PATCH 4/5] Enable small loop unrolling for O2
-
-Modern processors has multiple way instruction decoders
-For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
-instructions (usually has 3 uops with a cmp/jmp pair that can be
-macro-fused), the decoder would have 2 uops bubble for each iteration
-and the pipeline could not be fully utilized.
-
-Therefore, this patch enables loop unrolling for small size loop at O2
-to fullfill the decoder as much as possible. It turns on rtl loop
-unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
-In x86 backend the default behavior is to unroll small loops with less
-than 4 insns by 1 time.
-
-This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
-0.9% codesize increment. For other benchmarks the variants are minor
-and overall codesize increased by 0.2%.
-
-The kernel image size increased by 0.06%, and no impact on eembc.
-
-gcc/ChangeLog:
-
-	* common/config/i386/i386-common.cc (ix86_optimization_table):
-	Enable small loop unroll at O2 by default.
-	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
-	factor if -munroll-only-small-loops enabled and -funroll-loops/
-	-funroll-all-loops are disabled.
-	* config/i386/i386.h (struct processor_costs): Add 2 field
-	small_unroll_ninsns and small_unroll_factor.
-	* config/i386/i386.opt: Add -munroll-only-small-loops.
-	* doc/invoke.texi: Document -munroll-only-small-loops.
-	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
-	loop unrolling for -O2-speed and above if target hook
-	loop_unroll_adjust exists.
-	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
-	when target hook loop_unroll_adjust exists.
-	* config/i386/x86-tune-costs.h: Update all processor costs
-	with small_unroll_ninsns = 4 and small_unroll_factor = 2.
-
-gcc/testsuite/ChangeLog:
-
-	* gcc.dg/guality/loop-1.c: Add additional option
-	-mno-unroll-only-small-loops.
-	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
-	* gcc.target/i386/pr93002.c: Likewise.
----
- gcc/common/config/i386/i386-common.cc   |  1 +
- gcc/config/i386/i386.cc                 | 18 ++++++++
- gcc/config/i386/i386.h                  |  5 +++
- gcc/config/i386/i386.opt                |  4 ++
- gcc/config/i386/x86-tune-costs.h        | 56 +++++++++++++++++++++++++
- gcc/doc/invoke.texi                     | 11 ++++-
- gcc/loop-init.cc                        | 10 +++--
- gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 +
- gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
- gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
- 10 files changed, 105 insertions(+), 6 deletions(-)
-
-diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
-index 07fdd045f30..e1c1fb07d8a 100644
---- a/gcc/common/config/i386/i386-common.cc
-+++ b/gcc/common/config/i386/i386-common.cc
-@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
-     /* The STC algorithm produces the smallest code at -Os, for x86.  */
-     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
-       REORDER_BLOCKS_ALGORITHM_STC },
-+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
-     /* Turn off -fschedule-insns by default.  It tends to make the
-        problem with not enough registers even worse.  */
-     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
-diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
-index b16df5b183e..39b2468799c 100644
---- a/gcc/config/i386/i386.cc
-+++ b/gcc/config/i386/i386.cc
-@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
-   unsigned i;
-   unsigned mem_count = 0;
- 
-+  /* Unroll small size loop when unroll factor is not explicitly
-+     specified.  */
-+  if (!(flag_unroll_loops
-+	|| flag_unroll_all_loops
-+	|| loop->unroll))
-+    {
-+      nunroll = 1;
-+
-+      /* Any explicit -f{no-}unroll-{all-}loops turns off
-+	 -munroll-only-small-loops.  */
-+      if (ix86_unroll_only_small_loops
-+	  && !OPTION_SET_P (flag_unroll_loops)
-+	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
-+	nunroll = ix86_cost->small_unroll_factor;
-+
-+      return nunroll;
-+    }
-+
-   if (!TARGET_ADJUST_UNROLL)
-      return nunroll;
- 
-diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
-index a61c32b8957..421801111a7 100644
---- a/gcc/config/i386/i386.h
-+++ b/gcc/config/i386/i386.h
-@@ -219,6 +219,11 @@ struct processor_costs {
-   const char *const align_jump;		/* Jump alignment.  */
-   const char *const align_label;	/* Label alignment.  */
-   const char *const align_func;		/* Function alignment.  */
-+
-+  const unsigned small_unroll_ninsns;	/* Insn count limit for small loop
-+					   to be unrolled.  */
-+  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
-+					   be unrolled.  */
- };
- 
- extern const struct processor_costs *ix86_cost;
-diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
-index a6b0e28f238..3d369647bf7 100644
---- a/gcc/config/i386/i386.opt
-+++ b/gcc/config/i386/i386.opt
-@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
- -param=x86-stlf-window-ninsns=
- Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
- Instructions number above which STFL stall penalty can be compensated.
-+
-+munroll-only-small-loops
-+Target Var(ix86_unroll_only_small_loops) Init(0) Save
-+Enable conservative small loop unrolling.
-diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
-index 017ffa69958..b4303e4e971 100644
---- a/gcc/config/i386/x86-tune-costs.h
-+++ b/gcc/config/i386/x86-tune-costs.h
-@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* Processor costs (relative to an add) */
-@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
-   "4",					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   "4",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs i486_memcpy[2] = {
-@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs pentium_memcpy[2] = {
-@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static const
-@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
-@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs geode_memcpy[2] = {
-@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs k6_memcpy[2] = {
-@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
-   "32:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "32",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* For some reason, Athlon deals better with REP prefix (relative to loops)
-@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* K8 has optimized REP instruction for medium sized blocks, but for very
-@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
-@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
-   "32:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "32",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /*  BDVER has optimized REP instruction for medium sized blocks, but for
-@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "11",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- 
-@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
-@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- struct processor_costs znver3_cost = {
-@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
-@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* icelake_cost should produce code tuned for Icelake family of CPUs.
-@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
-@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
-   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
-@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "11",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs btver2_memcpy[2] = {
-@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "11",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs pentium4_memcpy[2] = {
-@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = {
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs nocona_memcpy[2] = {
-@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = {
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs atom_memcpy[2] = {
-@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs slm_memcpy[2] = {
-@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs tremont_memcpy[2] = {
-@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs intel_memcpy[2] = {
-@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* Generic should produce code tuned for Core-i7 (and newer chips)
-@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* core_cost should produce code tuned for Core familly of CPUs.  */
-@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
-diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
-index 9ac7f89ebb1..1961cafa2bb 100644
---- a/gcc/doc/invoke.texi
-+++ b/gcc/doc/invoke.texi
-@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options.
- -mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
- -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
- -mindirect-branch-register -mharden-sls=@var{choice} @gol
---mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
-+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
-+-munroll-only-small-loops}
- 
- @emph{x86 Windows Options}
- @gccoptlist{-mconsole  -mcygwin  -mno-cygwin  -mdll @gol
-@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols.  The default is
- @option{-mno-direct-extern-access} and executable compiled with
- @option{-mdirect-extern-access} may not be binary compatible if
- protected symbols are used in shared libraries and executable.
-+
-+@item -munroll-only-small-loops
-+@opindex munroll-only-small-loops
-+@opindex mno-unroll-only-small-loops
-+Controls conservative small loop unrolling. It is default enabled by
-+O2, and unrolls loop with less than 4 insns by 1 time. Explicit
-+-f[no-]unroll-[all-]loops would disable this flag to avoid any
-+unintended unrolling behavior that user does not want.
- @end table
- 
- @node x86 Windows Options
-diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
-index 1e4f6cfd7fb..84336865ef7 100644
---- a/gcc/loop-init.cc
-+++ b/gcc/loop-init.cc
-@@ -565,9 +565,12 @@ public:
-   {}
- 
-   /* opt_pass methods: */
--  virtual bool gate (function *)
-+  virtual bool gate (function * fun)
-     {
--      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
-+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
-+	      || (targetm.loop_unroll_adjust
-+		  && optimize >= 2
-+		  && optimize_function_for_speed_p (fun)));
-     }
- 
-   virtual unsigned int execute (function *);
-@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
-       if (dump_file)
- 	df_dump (dump_file);
- 
--      if (flag_unroll_loops)
-+      if (flag_unroll_loops
-+	  || targetm.loop_unroll_adjust)
- 	flags |= UAP_UNROLL;
-       if (flag_unroll_all_loops)
- 	flags |= UAP_UNROLL_ALL;
-diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
-index 1b1f6d32271..a32ea445a3f 100644
---- a/gcc/testsuite/gcc.dg/guality/loop-1.c
-+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
-@@ -1,5 +1,7 @@
- /* { dg-do run } */
- /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
-+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
-+
- 
- #include "../nop.h"
- 
-diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
-index 81841ef5bd7..cbc9fbb0450 100644
---- a/gcc/testsuite/gcc.target/i386/pr86270.c
-+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2" } */
-+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
- 
- int *a;
- long len;
-diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
-index 0248fcc00a5..f75a847f75d 100644
---- a/gcc/testsuite/gcc.target/i386/pr93002.c
-+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
-@@ -1,6 +1,6 @@
- /* PR target/93002 */
- /* { dg-do compile } */
--/* { dg-options "-O2" } */
-+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
- /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
- 
- volatile int sink;
--- 
-2.18.2
-
diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
deleted file mode 100644
index de3995f..0000000
--- a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
+++ /dev/null
@@ -1,231 +0,0 @@
-From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001
-From: Hongyu Wang <hongyu.wang@intel.com>
-Date: Sat, 19 Nov 2022 09:38:00 +0800
-Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR
- 107692]
-
-Followed by the discussion in pr107692, -munroll-only-small-loops
-Does not turns on/off -funroll-loops, and current check in
-pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take
-effect. Revert the change about targetm.loop_unroll_adjust and apply
-the backend option change to strictly follow the rule that
--funroll-loops takes full control of loop unrolling, and
-munroll-only-small-loops just change its behavior to unroll small size
-loops.
-
-gcc/ChangeLog:
-
-	PR target/107692
-	* common/config/i386/i386-common.cc (ix86_optimization_table):
-	Enable loop unroll O2, disable -fweb and -frename-registers
-	by default.
-	* config/i386/i386-options.cc
-	(ix86_override_options_after_change):
-	Disable small loop unroll when funroll-loops enabled, reset
-	cunroll_grow_size when it is not explicitly enabled.
-	(ix86_option_override_internal): Call
-	ix86_override_options_after_change instead of calling
-	ix86_recompute_optlev_based_flags and ix86_default_align
-	separately.
-	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
-	factor if -munroll-only-small-loops enabled.
-	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
-	loop unrolling for -O2-speed.
-	(pass_rtl_unroll_loops::execute): Rmove
-	targetm.loop_unroll_adjust check.
-
-gcc/testsuite/ChangeLog:
-
-	PR target/107692
-	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
-	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
-	* gcc.target/i386/pr93002.c: Likewise.
----
- gcc/common/config/i386/i386-common.cc   |  8 ++++++
- gcc/config/i386/i386-options.cc         | 34 ++++++++++++++++++++++---
- gcc/config/i386/i386.cc                 | 18 ++++---------
- gcc/loop-init.cc                        | 11 +++-----
- gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 --
- gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
- gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
- 7 files changed, 49 insertions(+), 28 deletions(-)
-
-diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
-index e1c1fb07d8a..5e777849f91 100644
---- a/gcc/common/config/i386/i386-common.cc
-+++ b/gcc/common/config/i386/i386-common.cc
-@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] =
-     /* The STC algorithm produces the smallest code at -Os, for x86.  */
-     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
-       REORDER_BLOCKS_ALGORITHM_STC },
-+
-+    /* Turn on -funroll-loops with -munroll-only-small-loops to enable small
-+       loop unrolling at -O2.  */
-+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
-     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
-+    /* Turns off -frename-registers and -fweb which are enabled by
-+       funroll-loops.  */
-+    { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
-+    { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
-     /* Turn off -fschedule-insns by default.  It tends to make the
-        problem with not enough registers even worse.  */
-     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
-diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
-index 32cc58a764b..b853ff55825 100644
---- a/gcc/config/i386/i386-options.cc
-+++ b/gcc/config/i386/i386-options.cc
-@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts,
- void
- ix86_override_options_after_change (void)
- {
-+  /* Default align_* from the processor table.  */
-   ix86_default_align (&global_options);
-+
-   ix86_recompute_optlev_based_flags (&global_options, &global_options_set);
-+
-+  /* Disable unrolling small loops when there's explicit
-+     -f{,no}unroll-loop.  */
-+  if ((OPTION_SET_P (flag_unroll_loops))
-+     || (OPTION_SET_P (flag_unroll_all_loops)
-+	 && flag_unroll_all_loops))
-+    {
-+      if (!OPTION_SET_P (ix86_unroll_only_small_loops))
-+	ix86_unroll_only_small_loops = 0;
-+      /* Re-enable -frename-registers and -fweb if funroll-loops
-+	 enabled.  */
-+      if (!OPTION_SET_P (flag_web))
-+	flag_web = flag_unroll_loops;
-+      if (!OPTION_SET_P (flag_rename_registers))
-+	flag_rename_registers = flag_unroll_loops;
-+      /* -fcunroll-grow-size default follws -f[no]-unroll-loops.  */
-+      if (!OPTION_SET_P (flag_cunroll_grow_size))
-+	flag_cunroll_grow_size = flag_unroll_loops
-+				 || flag_peel_loops
-+				 || optimize >= 3;
-+    }
-+  else
-+    {
-+      if (!OPTION_SET_P (flag_cunroll_grow_size))
-+	flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
-+    }
-+
- }
- 
- /* Clear stack slot assignments remembered from previous functions.
-@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p,
- 
-   set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes);
- 
--  ix86_recompute_optlev_based_flags (opts, opts_set);
-+  ix86_override_options_after_change ();
- 
-   ix86_tune_cost = processor_cost_table[ix86_tune];
-   /* TODO: ix86_cost should be chosen at instruction or function granuality
-@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p,
-       || TARGET_64BIT_P (opts->x_ix86_isa_flags))
-     opts->x_ix86_regparm = REGPARM_MAX;
- 
--  /* Default align_* from the processor table.  */
--  ix86_default_align (opts);
--
-   /* Provide default for -mbranch-cost= value.  */
-   SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost,
- 		       ix86_tune_cost->branch_cost);
-diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
-index 39b2468799c..000415c0e2e 100644
---- a/gcc/config/i386/i386.cc
-+++ b/gcc/config/i386/i386.cc
-@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
- 
-   /* Unroll small size loop when unroll factor is not explicitly
-      specified.  */
--  if (!(flag_unroll_loops
--	|| flag_unroll_all_loops
--	|| loop->unroll))
-+  if (ix86_unroll_only_small_loops && !loop->unroll)
-     {
--      nunroll = 1;
--
--      /* Any explicit -f{no-}unroll-{all-}loops turns off
--	 -munroll-only-small-loops.  */
--      if (ix86_unroll_only_small_loops
--	  && !OPTION_SET_P (flag_unroll_loops)
--	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
--	nunroll = ix86_cost->small_unroll_factor;
--
--      return nunroll;
-+      if (loop->ninsns <= ix86_cost->small_unroll_ninsns)
-+	return MIN (nunroll, ix86_cost->small_unroll_factor);
-+      else
-+	return 1;
-     }
- 
-   if (!TARGET_ADJUST_UNROLL)
-diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
-index 84336865ef7..ed1b2f6ebab 100644
---- a/gcc/loop-init.cc
-+++ b/gcc/loop-init.cc
-@@ -565,12 +565,10 @@ public:
-   {}
- 
-   /* opt_pass methods: */
--  virtual bool gate (function * fun)
-+  virtual bool gate (function *)
-     {
--      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
--	      || (targetm.loop_unroll_adjust
--		  && optimize >= 2
--		  && optimize_function_for_speed_p (fun)));
-+      return (flag_unroll_loops || flag_unroll_all_loops
-+	      || cfun->has_unroll);
-     }
- 
-   virtual unsigned int execute (function *);
-@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun)
-       if (dump_file)
- 	df_dump (dump_file);
- 
--      if (flag_unroll_loops
--	  || targetm.loop_unroll_adjust)
-+      if (flag_unroll_loops)
- 	flags |= UAP_UNROLL;
-       if (flag_unroll_all_loops)
- 	flags |= UAP_UNROLL_ALL;
-diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
-index a32ea445a3f..1b1f6d32271 100644
---- a/gcc/testsuite/gcc.dg/guality/loop-1.c
-+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
-@@ -1,7 +1,5 @@
- /* { dg-do run } */
- /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
--/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
--
- 
- #include "../nop.h"
- 
-diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
-index cbc9fbb0450..98b012caf23 100644
---- a/gcc/testsuite/gcc.target/i386/pr86270.c
-+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
-+/* { dg-options "-O2 -fno-unroll-loops" } */
- 
- int *a;
- long len;
-diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
-index f75a847f75d..7e2d869e17b 100644
---- a/gcc/testsuite/gcc.target/i386/pr93002.c
-+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
-@@ -1,6 +1,6 @@
- /* PR target/93002 */
- /* { dg-do compile } */
--/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
-+/* { dg-options "-O2 -fno-unroll-loops" } */
- /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
- 
- volatile int sink;
--- 
-2.18.2
-
diff --git a/download b/download
index 5e06243..7089174 100644
--- a/download
+++ b/download
@@ -1,5 +1,5 @@
 5ff66c50ca9288d9a3d695a031f6950c  doxygen-1.8.0.src.tar.gz
-69e8afb0efe379aebf6ce07f70511e25  gcc-12.1.1-20220507.tar.xz
+4b46c6ef416360a726fe71470d203985  gcc-12.1.1-20220628.tar.xz
 86ee6e54ebfc4a90b643a65e402c4048  gmp-6.1.0.tar.bz2
 11436d6b205e516635b666090b94ab32  isl-0.18.tar.bz2
 d6a1d5f8ddea3abd2cc3e98f58352d26  mpc-1.0.3.tar.gz
diff --git a/gcc.spec b/gcc.spec
index 977a4d1..02091d3 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,13 +2,13 @@
 %{?scl:%global __strip %%{_scl_root}/usr/bin/strip}
 %{?scl:%global __objdump %%{_scl_root}/usr/bin/objdump}
 %{?scl:%scl_package gcc}
-%global DATE 20220507
-%global gitrev fa107326a13af9a7d7aa0df28fe364db0f6fb171
+%global DATE 20220628
+%global gitrev 874cb9452c56f1c3b3a7b5bfed93a262504b9856
 %global gcc_version 12.1.1
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 1
+%global gcc_release 3
 %global nvptx_tools_gitrev 5f6f343a302d620b0868edab376c00b15741e39e
 %global newlib_cygwin_gitrev 50e2a63b04bdd018484605fbb954fd1bd5147fa0
 %global mpc_version 1.0.3
@@ -80,12 +80,12 @@
 %else
 %global build_libasan 0
 %endif
-%ifarch x86_64 ppc64 ppc64le aarch64
+%ifarch x86_64 ppc64 ppc64le aarch64 s390x
 %global build_libtsan 1
 %else
 %global build_libtsan 0
 %endif
-%ifarch x86_64 ppc64 ppc64le aarch64
+%ifarch x86_64 ppc64 ppc64le aarch64 s390x
 %global build_liblsan 1
 %else
 %global build_liblsan 0
@@ -147,7 +147,7 @@
 Summary: GCC version 12
 Name: %{?scl_prefix}gcc
 Version: %{gcc_version}
-Release: %{gcc_release}.8%{?dist}
+Release: %{gcc_release}.2%{?dist}
 # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have
 # GCC Runtime Exception.
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
@@ -195,7 +195,6 @@ URL: http://gcc.gnu.org
 # Need binutils which support --generate-missing-build-notes=yes >= 2.31
 %if 0%{?scl:1}
 BuildRequires: %{?scl_prefix}binutils >= 2.31
-# For testing
 BuildRequires: %{?scl_prefix}gdb >= 7.4.50
 %endif
 # While gcc doesn't include statically linked binaries, during testing
@@ -349,11 +348,15 @@ Patch8: gcc12-no-add-needed.patch
 Patch9: gcc12-Wno-format-security.patch
 Patch10: gcc12-rh1574936.patch
 Patch11: gcc12-d-shared-libphobos.patch
+Patch12: gcc12-pr105551.patch
+Patch13: gcc12-libtsan-s390x.patch
+# This has been backported to GCC 12, so eventually we can drop it.
+Patch14: gcc12-pr105991.patch
 
 Patch100: gcc12-fortran-fdec-duplicates.patch
 Patch101: gcc12-fortran-flogical-as-integer.patch
-Patch105: gcc12-fortran-fdec-override-kind.patch
-Patch106: gcc12-fortran-fdec-non-logical-if.patch
+Patch102: gcc12-fortran-fdec-override-kind.patch
+Patch103: gcc12-fortran-fdec-non-logical-if.patch
 
 Patch1000: gcc12-libstdc++-compat.patch
 Patch1001: gcc12-alt-compat-test.patch
@@ -383,11 +386,6 @@ Patch3016: 0019-xfails.patch
 Patch3017: 0020-more-fixes.patch
 Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
-Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
-Patch3021: 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
-Patch3022: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
-Patch3023: 0026-Enable-small-loop-unrolling-for-O2.patch
-Patch3024: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -729,12 +727,15 @@ so that there cannot be any synchronization problems.
 %patch10 -p0 -b .rh1574936~
 %endif
 %patch11 -p0 -b .d-shared-libphobos~
+%patch12 -p0 -b .pr105551~
+%patch13 -p0 -b .libtsan-s390x~
+%patch14 -p1 -b .pr105991~
 
 %if 0%{?rhel} >= 6
 %patch100 -p1 -b .fortran-fdec-duplicates~
 %patch101 -p1 -b .fortran-flogical-as-integer~
-%patch105 -p1 -b .fortran-fdec-override-kind~
-%patch106 -p1 -b .fortran-fdec-non-logical-if~
+%patch102 -p1 -b .fortran-fdec-override-kind~
+%patch103 -p1 -b .fortran-fdec-non-logical-if~
 %endif
 
 %ifarch %{arm}
@@ -790,11 +791,6 @@ cd ..
 %if 0%{?rhel} <= 7
 %patch3019 -p1 -b .dts-test-19~
 %endif
-%patch3020 -p1 -b .dts-test-20~
-%patch3021 -p1 -b .dts-test-21~
-%patch3022 -p1 -b .dts-test-22~
-%patch3023 -p1 -b .dts-test-23~
-%patch3024 -p1 -b .dts-test-24~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2249,6 +2245,7 @@ fi
 %{_prefix}/bin/gcc-ar%{!?scl:12}
 %{_prefix}/bin/gcc-nm%{!?scl:12}
 %{_prefix}/bin/gcc-ranlib%{!?scl:12}
+%{_prefix}/bin/lto-dump%{!?scl:12}
 %ifarch ppc
 %{_prefix}/bin/%{_target_platform}-gcc%{!?scl:12}
 %endif
@@ -2273,6 +2270,7 @@ fi
 %{_mandir}/man1/gcov.1*
 %{_mandir}/man1/gcov-tool.1*
 %{_mandir}/man1/gcov-dump.1*
+%{_mandir}/man1/lto-dump.1*
 %{_infodir}/gcc*
 %{_infodir}/cpp*
 %endif
@@ -2877,6 +2875,13 @@ fi
 %if 0%{?rhel} < 8
 %files -n liblsan
 %{?scl:%{_root_prefix}}%{!?scl:%{_prefix}}/%{_lib}/liblsan.so.0*
+%else
+%ifarch s390x
+# Except that on s390x we don't have the system liblsan, because we
+# only enabled LSan in GCC 12.  ??? Ugly duplication.
+%files -n liblsan
+%{?scl:%{_root_prefix}}%{!?scl:%{_prefix}}/%{_lib}/liblsan.so.0*
+%endif
 %endif
 
 %files -n %{?scl_prefix}liblsan-devel
@@ -2952,20 +2957,41 @@ fi
 %endif
 
 %changelog
-* Tue Nov 29 2022 Hongyu Wang <hongyu.wang@intel.com> 12.1.1-1.8
-- i386: Only enable small loop unrolling in backend [PR 107692]
-
-* Tue Nov 29 2022 Hongyu Wang <hongyu.wang@intel.com> 12.1.1-1.7
-- Enable small loop unrolling for O2
-
-* Thu Nov 10 2022 Cui Lili <lili.cui@intel.com> 12.1.1-1.6
-- Add attribute hot judgement for INLINE_HINT_known_hot hint
-
-* Thu Nov 10 2022 Cui Lili <lili.cui@intel.com> 12.1.1-1.5
-- Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
-
-* Thu Nov 10 2022 Haochen Jiang <haochen.jiang@intel.com> 12.1.1-1.4
-- i386: Add syscall to enable AMX for latest kernels
+* Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.2
+- recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789)
+
+* Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.1
+- always ship liblsan on s390x (#2104829)
+
+* Wed Jul  6 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3
+- update from releases/gcc-12 branch
+  - PRs c++/49387, c++/102307, c++/102651, c++/104470, c++/105491, c++/105589,
+	c++/105623, c++/105652, c++/105655, c++/105725, c++/105734,
+	c++/105756, c++/105761, c++/105779, c++/105795, c++/105852,
+	c++/105871, c++/105885, c++/105908, c++/105925, c++/105931,
+	c++/105964, c++/106001, c/105635, d/105544, fortran/105230,
+	gcov-profile/105535, ipa/100413, ipa/105600, ipa/105639, ipa/105739,
+	libgomp/105745, libgomp/106045, libstdc++/104731, libstdc++/105284,
+	libstdc++/105671, libstdc++/105681, middle-end/105537,
+	middle-end/105604, middle-end/105711, middle-end/105951,
+	middle-end/105998, middle-end/106030, other/105527,
+	preprocessor/105732, rtl-optimization/105455, rtl-optimization/105559,
+	rtl-optimization/105577, sanitizer/105714, sanitizer/105729,
+	target/101891, target/104871, target/105162, target/105209,
+	target/105292, target/105472, target/105556, target/105599,
+	target/105854, target/105879, target/105953, target/105960,
+	target/105970, target/105981, target/106096, tree-optimization/103116,
+	tree-optimization/105431, tree-optimization/105458,
+	tree-optimization/105528, tree-optimization/105562,
+	tree-optimization/105618, tree-optimization/105726,
+	tree-optimization/105736, tree-optimization/105786,
+	tree-optimization/105940
+- enable tsan and lsan on s390x (#2101610)
+- fix up libtsan on s390x
+- fix nvptx build (PRs bootstrap/105551, target/105938)
+
+* Tue Jun 28 2022 Marek Polacek <polacek@redhat.com> 12.1.1-1.4
+- ship lto-dump (#2101835)
 
 * Thu Jun 23 2022 Marek Polacek <polacek@redhat.com> 12.1.1-1.3
 - don't provide g++/fortran (CS-1145)
diff --git a/gcc12-libtsan-s390x.patch b/gcc12-libtsan-s390x.patch
new file mode 100644
index 0000000..4241d43
--- /dev/null
+++ b/gcc12-libtsan-s390x.patch
@@ -0,0 +1,17 @@
+commit r12-8527-g7811663964aa7e31c3939b859bbfa2e16919639f                                                                                                                             
+Author: Martin Liska <mliska@suse.cz>                                                                                                                                                 
+Date:   Wed Jun 29 15:28:07 2022 +0200                                                                                                                                                
+                                                                                                                                                                                      
+    libsanitizer: cherry-pick 791e0d1bc85d                                                                                                                                            
+                                                                                                                                                                                      
+    791e0d1bc85d: [compiler-rt] Add NO_EXEC_STACK_DIRECTIVE on s390x                                                                                                                  
+    (cherry picked from commit aa87b7541b4c11f59c521154513f844ea6b5c977)                                                                                                              
+
+--- libsanitizer/tsan/tsan_rtl_s390x.S
++++ libsanitizer/tsan/tsan_rtl_s390x.S
+@@ -45,3 +45,5 @@ intercept setjmp, _ZN14__interception11real_setjmpE
+ intercept _setjmp, _ZN14__interception12real__setjmpE
+ intercept sigsetjmp, _ZN14__interception14real_sigsetjmpE
+ intercept __sigsetjmp, _ZN14__interception16real___sigsetjmpE
++
++NO_EXEC_STACK_DIRECTIVE
diff --git a/gcc12-pr105551.patch b/gcc12-pr105551.patch
new file mode 100644
index 0000000..ce8be26
--- /dev/null
+++ b/gcc12-pr105551.patch
@@ -0,0 +1,28 @@
+2022-05-11  Richard Biener  <rguenther@suse.de>
+
+	PR bootstrap/105551
+	* opts.cc (finish_options): Also disable var-tracking if
+	!DWARF2_DEBUGGING_INFO.
+
+--- gcc/opts.cc
++++ gcc/opts.cc
+@@ -1334,11 +1334,15 @@ finish_options (struct gcc_options *opts, struct gcc_options *opts_set,
+ 	      || opts->x_flag_selective_scheduling2));
+ 
+   /* We know which debug output will be used so we can set flag_var_tracking
+-     and flag_var_tracking_uninit if the user has not specified them.  Note
+-     we have not yet initialized debug_hooks so we might uselessly run
+-     var-tracking on targets without var_location debug hook support.  */
++     and flag_var_tracking_uninit if the user has not specified them.  */
+   if (opts->x_debug_info_level < DINFO_LEVEL_NORMAL
+-      || !dwarf_debuginfo_p (opts))
++      || !dwarf_debuginfo_p (opts)
++      /* We have not yet initialized debug hooks so match that to check
++	 whether we're only doing DWARF2_LINENO_DEBUGGING_INFO.  */
++#ifndef DWARF2_DEBUGGING_INFO
++      || true
++#endif
++     )
+     {
+       if ((opts_set->x_flag_var_tracking && opts->x_flag_var_tracking == 1)
+ 	  || (opts_set->x_flag_var_tracking_uninit
diff --git a/gcc12-pr105991.patch b/gcc12-pr105991.patch
new file mode 100644
index 0000000..b9e12f4
--- /dev/null
+++ b/gcc12-pr105991.patch
@@ -0,0 +1,89 @@
+commit 6c175b3d170de2bb02b7bd45b3348eec05d28451
+Author: Roger Sayle <roger@nextmovesoftware.com>
+Date:   Mon Jul 4 13:58:37 2022 +0100
+
+    PR target/105991: Recognize PLUS and XOR forms of rldimi in rs6000.md.
+    
+    This patch addresses PR target/105991 where a change to prefer representing
+    shifts and adds at the tree-level as multiplications, causes problems for
+    the rldimi patterns in the powerpc backend.  The issue is that rs6000.md
+    models this pattern using IOR, and some variants that have the equivalent
+    PLUS or XOR in the RTL fail to match some *rotl<mode>4_insert patterns.
+    This is fixed in this patch by adding a define_insn_and_split to locally
+    canonicalize the PLUS and XOR forms to the backend's preferred IOR form.
+    
+    Backported from master.
+    
+    2022-07-04  Roger Sayle  <roger@nextmovesoftware.com>
+                Marek Polacek  <polacek@redhat.com>
+                Segher Boessenkool  <segher@kernel.crashing.org>
+                Kewen Lin  <linkw@linux.ibm.com>
+    
+    gcc/ChangeLog
+            PR target/105991
+            * config/rs6000/rs6000.md (rotl<mode>3_insert_3): Check that
+            exact_log2 doesn't return -1 (or zero).
+            (plus_xor): New code iterator.
+            (*rotl<mode>3_insert_3_<code>): New define_insn_and_split.
+    
+    gcc/testsuite/ChangeLog
+            PR target/105991
+            * gcc.target/powerpc/pr105991.c: New test case.
+
+diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
+index 64049a6e521..6082ded8c31 100644
+--- a/gcc/config/rs6000/rs6000.md
++++ b/gcc/config/rs6000/rs6000.md
+@@ -4178,7 +4178,8 @@ (define_insn "rotl<mode>3_insert_3"
+ 			  (match_operand:GPR 4 "const_int_operand" "n"))
+ 		 (ashift:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
+ 			     (match_operand:SI 2 "const_int_operand" "n"))))]
+-  "INTVAL (operands[2]) == exact_log2 (UINTVAL (operands[4]) + 1)"
++  "INTVAL (operands[2]) > 0
++   && INTVAL (operands[2]) == exact_log2 (UINTVAL (operands[4]) + 1)"
+ {
+   if (<MODE>mode == SImode)
+     return "rlwimi %0,%1,%h2,0,31-%h2";
+@@ -4187,6 +4188,24 @@ (define_insn "rotl<mode>3_insert_3"
+ }
+   [(set_attr "type" "insert")])
+ 
++; Canonicalize the PLUS and XOR forms to IOR for rotl<mode>3_insert_3
++(define_code_iterator plus_xor [plus xor])
++
++(define_insn_and_split "*rotl<mode>3_insert_3_<code>"
++  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
++	(plus_xor:GPR
++	  (and:GPR (match_operand:GPR 3 "gpc_reg_operand" "0")
++		   (match_operand:GPR 4 "const_int_operand" "n"))
++	  (ashift:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
++		      (match_operand:SI 2 "const_int_operand" "n"))))]
++  "INTVAL (operands[2]) > 0
++   && INTVAL (operands[2]) == exact_log2 (UINTVAL (operands[4]) + 1)"
++  "#"
++  "&& 1"
++  [(set (match_dup 0)
++	(ior:GPR (and:GPR (match_dup 3) (match_dup 4))
++		 (ashift:GPR (match_dup 1) (match_dup 2))))])
++
+ (define_code_iterator plus_ior_xor [plus ior xor])
+ 
+ (define_split
+diff --git a/gcc/testsuite/gcc.target/powerpc/pr105991.c b/gcc/testsuite/gcc.target/powerpc/pr105991.c
+new file mode 100644
+index 00000000000..0d9d130cb63
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/powerpc/pr105991.c
+@@ -0,0 +1,12 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++/* { dg-require-effective-target lp64 } */
++unsigned long long
++foo (unsigned long long value)
++{
++  value &= 0xffffffff;
++  value |= value << 32;
++  return value;
++}
++/* { dg-final { scan-assembler {\mrldimi\M} } } */
++
-- 
Gitee


From 254eae38118b17cc1b48072a684ec58d4f67eb48 Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Thu, 10 Nov 2022 09:40:26 +0800
Subject: [PATCH 2/6] i386: Add syscall to enable AMX for latest kernels

gcc/testsuite/ChangeLog:

	* gcc.target/i386/amx-check.h (request_perm_xtile_data):
	New function to check if AMX is usable and enable AMX.
	(main): Run test if AMX is usable.

url: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5e377d21f1f345d8b157b9bc306e02bb9bd45e01
---
 ...all-to-enable-AMX-for-latest-kernels.patch | 77 +++++++++++++++++++
 gcc.spec                                      |  8 +-
 2 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch

diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
new file mode 100644
index 0000000..94625b5
--- /dev/null
+++ b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
@@ -0,0 +1,77 @@
+From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001
+From: Haochen Jiang <haochen.jiang@intel.com>
+Date: Thu, 16 Jun 2022 00:15:53 -0700
+Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/amx-check.h (request_perm_xtile_data):
+	New function to check if AMX is usable and enable AMX.
+	(main): Run test if AMX is usable.
+---
+ gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
+index 434b0e59703..6fff5ff4631 100644
+--- a/gcc/testsuite/gcc.target/i386/amx-check.h
++++ b/gcc/testsuite/gcc.target/i386/amx-check.h
+@@ -4,11 +4,24 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <stdint.h>
++#include <unistd.h>
++#ifdef __linux__
++#include <sys/syscall.h>
++#endif
+ #ifdef DEBUG
+ #include <stdio.h>
+ #endif
+ #include "cpuid.h"
+ 
++#define XFEATURE_XTILECFG	17
++#define XFEATURE_XTILEDATA	18
++#define XFEATURE_MASK_XTILECFG	(1 << XFEATURE_XTILECFG)
++#define XFEATURE_MASK_XTILEDATA	(1 << XFEATURE_XTILEDATA)
++#define XFEATURE_MASK_XTILE	(XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
++
++#define ARCH_GET_XCOMP_PERM	0x1022
++#define ARCH_REQ_XCOMP_PERM	0x1023
++
+ /* TODO: The tmm emulation is temporary for current
+    AMX implementation with no tmm regclass, should
+    be changed in the future. */
+@@ -44,6 +57,20 @@ typedef struct __tile
+ /* Stride (colum width in byte) used for tileload/store */
+ #define _STRIDE 64
+ 
++#ifdef __linux__
++/* We need syscall to use amx functions */
++int request_perm_xtile_data()
++{
++  unsigned long bitmask;
++
++  if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) ||
++      syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask))
++    return 0;
++
++  return (bitmask & XFEATURE_MASK_XTILE) != 0;
++}
++#endif
++
+ /* Initialize tile config by setting all tmm size to 16x64 */
+ void init_tile_config (__tilecfg_u *dst)
+ {
+@@ -185,6 +212,9 @@ main ()
+ #endif
+ #ifdef AMX_BF16
+       && __builtin_cpu_supports ("amx-bf16")
++#endif
++#ifdef __linux__
++      && request_perm_xtile_data ()
+ #endif
+       )
+     {
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 02091d3..ca14fb6 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -1,3 +1,4 @@
+%define anolis_release .0.1
 %global __python /usr/bin/python3
 %{?scl:%global __strip %%{_scl_root}/usr/bin/strip}
 %{?scl:%global __objdump %%{_scl_root}/usr/bin/objdump}
@@ -147,7 +148,7 @@
 Summary: GCC version 12
 Name: %{?scl_prefix}gcc
 Version: %{gcc_version}
-Release: %{gcc_release}.2%{?dist}
+Release: %{gcc_release}.2%{anolis_release}%{?dist}
 # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have
 # GCC Runtime Exception.
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
@@ -386,6 +387,7 @@ Patch3016: 0019-xfails.patch
 Patch3017: 0020-more-fixes.patch
 Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
+Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -791,6 +793,7 @@ cd ..
 %if 0%{?rhel} <= 7
 %patch3019 -p1 -b .dts-test-19~
 %endif
+%patch3020 -p1 -b .dts-test-20~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2957,6 +2960,9 @@ fi
 %endif
 
 %changelog
+* Thu Dec 22 2022 Haochen Jiang <haochen.jiang@intel.com> 12.1.1-3.2.0.1
+- i386: Add syscall to enable AMX for latest kernels
+
 * Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.2
 - recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789)
 
-- 
Gitee


From 94abbff49f20b8d5664b2973f97cf21b240e930e Mon Sep 17 00:00:00 2001
From: "Cui,Lili" <lili.cui@intel.com>
Date: Thu, 10 Nov 2022 09:45:02 +0800
Subject: [PATCH 3/6] Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS

gcc/ChangeLog:

	* config/i386/driver-i386.cc (host_detect_local_cpu):
	Move sapphirerapids out of AVX512_VP2INTERSECT.
	* config/i386/i386.h: Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
	* doc/invoke.texi: Remove AVX512_VP2INTERSECT from SAPPHIRERAPIDS

url: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=11c72f20d4d7ba1862a257cef05dc3a5e84a276d
---
 ...VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch | 83 +++++++++++++++++++
 gcc.spec                                      |  3 +
 2 files changed, 86 insertions(+)
 create mode 100644 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch

diff --git a/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch b/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
new file mode 100644
index 0000000..42cabc2
--- /dev/null
+++ b/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
@@ -0,0 +1,83 @@
+From 11c72f20d4d7ba1862a257cef05dc3a5e84a276d Mon Sep 17 00:00:00 2001
+From: "Cui,Lili" <lili.cui@intel.com>
+Date: Thu, 29 Sep 2022 14:28:06 +0800
+Subject: [PATCH] Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
+
+gcc/ChangeLog:
+
+	* config/i386/driver-i386.cc (host_detect_local_cpu):
+	Move sapphirerapids out of AVX512_VP2INTERSECT.
+	* config/i386/i386.h: Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
+	* doc/invoke.texi: Remove AVX512_VP2INTERSECT from SAPPHIRERAPIDS
+---
+ gcc/config/i386/driver-i386.cc | 13 +++++--------
+ gcc/config/i386/i386.h         |  7 +++----
+ gcc/doc/invoke.texi            |  8 ++++----
+ 3 files changed, 12 insertions(+), 16 deletions(-)
+
+diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
+index 3c702fdca33..ef567045c67 100644
+--- a/gcc/config/i386/driver-i386.cc
++++ b/gcc/config/i386/driver-i386.cc
+@@ -589,15 +589,12 @@ const char *host_detect_local_cpu (int argc, const char **argv)
+ 	      /* This is unknown family 0x6 CPU.  */
+ 	      if (has_feature (FEATURE_AVX))
+ 		{
++		  /* Assume Tiger Lake */
+ 		  if (has_feature (FEATURE_AVX512VP2INTERSECT))
+-		    {
+-		      if (has_feature (FEATURE_TSXLDTRK))
+-			/* Assume Sapphire Rapids.  */
+-			cpu = "sapphirerapids";
+-		      else
+-			/* Assume Tiger Lake */
+-			cpu = "tigerlake";
+-		    }
++		    cpu = "tigerlake";
++		  /* Assume Sapphire Rapids.  */
++		  else if (has_feature (FEATURE_TSXLDTRK))
++		    cpu = "sapphirerapids";
+ 		  /* Assume Cooper Lake */
+ 		  else if (has_feature (FEATURE_AVX512BF16))
+ 		    cpu = "cooperlake";
+diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
+index 900a3bc3673..372a2cff8fe 100644
+--- a/gcc/config/i386/i386.h
++++ b/gcc/config/i386/i386.h
+@@ -2326,10 +2326,9 @@ constexpr wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT
+ constexpr wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI
+   | PTA_MOVDIR64B | PTA_CLWB | PTA_AVX512VP2INTERSECT | PTA_KL | PTA_WIDEKL;
+ constexpr wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_ICELAKE_SERVER | PTA_MOVDIRI
+-  | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_ENQCMD | PTA_CLDEMOTE
+-  | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE
+-  | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16
+-  | PTA_AVX512BF16;
++  | PTA_MOVDIR64B | PTA_ENQCMD | PTA_CLDEMOTE | PTA_PTWRITE | PTA_WAITPKG
++  | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE | PTA_AMX_INT8 | PTA_AMX_BF16
++  | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16 | PTA_AVX512BF16;
+ constexpr wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF
+   | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD | PTA_PREFETCHWT1;
+ constexpr wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 271c8bb8468..a9ecc4426a4 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -32057,11 +32057,11 @@ Intel sapphirerapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
+ SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE,
+ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW,
+ AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ,
+-AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2
++AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
+ VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
+-MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
+-SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16
+-and AVX512BF16 instruction set support.
++MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK,
++UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16 and AVX512BF16
++instruction set support.
+ 
+ @item alderlake
+ Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3,
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index ca14fb6..36df846 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -388,6 +388,7 @@ Patch3017: 0020-more-fixes.patch
 Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
+Patch3021: 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -794,6 +795,7 @@ cd ..
 %patch3019 -p1 -b .dts-test-19~
 %endif
 %patch3020 -p1 -b .dts-test-20~
+%patch3021 -p1 -b .dts-test-21~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2962,6 +2964,7 @@ fi
 %changelog
 * Thu Dec 22 2022 Haochen Jiang <haochen.jiang@intel.com> 12.1.1-3.2.0.1
 - i386: Add syscall to enable AMX for latest kernels
+- Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
 
 * Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.2
 - recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789)
-- 
Gitee


From 01bb5ff1516e27cc99b5603d584153dae67016ec Mon Sep 17 00:00:00 2001
From: "Cui,Lili" <lili.cui@intel.com>
Date: Thu, 10 Nov 2022 09:48:30 +0800
Subject: [PATCH 4/6] Add attribute hot judgement for INLINE_HINT_known_hot
 hint.

We set up the INLINE_HINT_known_hot hint only when we have profile feedback.
Now also take function attributes into account: when both caller and callee
have __attribute__((hot)), we also set up the INLINE_HINT_known_hot hint
for the edge.

With this patch applied,
ADL Multi-copy:    538.imagic_r  16.7%
ICX Multi-copy:    538.imagic_r  15.2%
CLX Multi-copy:    538.imagic_r  12.7%
Znver3 Multi-copy: 538.imagic_r  10.6%
Arm Multi-copy:    538.imagic_r  13.4%

gcc/ChangeLog

	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
	judgement for INLINE_HINT_known_hot hint.

gcc/testsuite/ChangeLog:

	* gcc.dg/ipa/inlinehint-6.c: New test.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a
---
 ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 ++++++++++++++++++
 gcc.spec                                      |   3 +
 2 files changed, 126 insertions(+)
 create mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch

diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
new file mode 100644
index 0000000..3e70f0c
--- /dev/null
+++ b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
@@ -0,0 +1,123 @@
+From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001
+From: "Cui,Lili" <lili.cui@intel.com>
+Date: Tue, 1 Nov 2022 09:16:49 +0800
+Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint.
+
+We set up INLINE_HINT_known_hot hint only when we have profile feedback,
+now add function attribute judgement for it, when both caller and callee
+have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint
+for it.
+
+With this patch applied,
+ADL Multi-copy:    538.imagic_r  16.7%
+ICX Multi-copy:    538.imagic_r  15.2%
+CLX Multi-copy:    538.imagic_r  12.7%
+Znver3 Multi-copy: 538.imagic_r  10.6%
+Arm Multi-copy:    538.imagic_r  13.4%
+
+gcc/ChangeLog
+
+	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
+	judgement for INLINE_HINT_known_hot hint.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.dg/ipa/inlinehint-6.c: New test.
+---
+ gcc/ipa-inline-analysis.cc              | 13 ++++---
+ gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++
+ 2 files changed, 56 insertions(+), 4 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+
+diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc
+index 1ca685d1b0e..7bd29c36590 100644
+--- a/gcc/ipa-inline-analysis.cc
++++ b/gcc/ipa-inline-analysis.cc
+@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "ipa-utils.h"
+ #include "cfgexpand.h"
+ #include "gimplify.h"
++#include "attribs.h"
+ 
+ /* Cached node/edge growths.  */
+ fast_call_summary<edge_growth_cache_entry *, va_heap> *edge_growth_cache = NULL;
+@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time)
+       hints = estimates.hints;
+     }
+ 
+-  /* When we have profile feedback, we can quite safely identify hot
+-     edges and for those we disable size limits.  Don't do that when
+-     probability that caller will call the callee is low however, since it
++  /* When we have profile feedback or function attribute, we can quite safely
++     identify hot edges and for those we disable size limits.  Don't do that
++     when probability that caller will call the callee is low however, since it
+      may hurt optimization of the caller's hot path.  */
+-  if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
++  if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
+       && (edge->count.ipa ().apply_scale (2, 1)
+ 	  > (edge->caller->inlined_to
+ 	     ? edge->caller->inlined_to->count.ipa ()
+ 	     : edge->caller->count.ipa ())))
++      || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl))
++	  != NULL
++	 && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl))
++	  != NULL))
+     hints |= INLINE_HINT_known_hot;
+ 
+   gcc_checking_assert (size >= 0);
+diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+new file mode 100644
+index 00000000000..1f3be641c6d
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+@@ -0,0 +1,47 @@
++/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp"  } */
++/* { dg-add-options bind_pic_locally } */
++
++#define size_t long long int
++
++struct A
++{
++  size_t f1, f2, f3, f4;
++};
++struct C
++{
++  struct A a;
++  size_t b;
++};
++struct C x;
++
++__attribute__((hot)) struct C callee (struct A *a, struct C *c)
++{
++  c->a=(*a);
++
++  if((c->b + 7) & 17)
++   {
++      c->a.f1 = c->a.f2 + c->a.f1;
++      c->a.f2 = c->a.f3 - c->a.f2;
++      c->a.f3 = c->a.f2 + c->a.f3;
++      c->a.f4 = c->a.f2 - c->a.f4;
++      c->b = c->a.f2;
++
++    }
++  return *c;
++}
++
++__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c)
++{
++  struct A a;
++  a.f1 = 1 + d;
++  a.f2 = e;
++  a.f3 = 12 + f;
++  a.f4 = 68 + g;
++  if (c->b > 0)
++    return callee (&a, c);
++  else
++    return *c;
++}
++
++/* { dg-final { scan-ipa-dump "known_hot"  "inline"  } } */
++
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 36df846..d512293 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -389,6 +389,7 @@ Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 Patch3021: 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
+Patch3022: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -796,6 +797,7 @@ cd ..
 %endif
 %patch3020 -p1 -b .dts-test-20~
 %patch3021 -p1 -b .dts-test-21~
+%patch3022 -p1 -b .dts-test-22~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2965,6 +2967,7 @@ fi
 * Thu Dec 22 2022 Haochen Jiang <haochen.jiang@intel.com> 12.1.1-3.2.0.1
 - i386: Add syscall to enable AMX for latest kernels
 - Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
+- Add attribute hot judgement for INLINE_HINT_known_hot hint
 
 * Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.2
 - recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789)
-- 
Gitee


From 9a61d4f702ca6f5bf4eae11aa9bfb565de73ca22 Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Tue, 29 Nov 2022 14:18:12 +0800
Subject: [PATCH 5/6] Enable small loop unrolling for O2

Modern processors have multi-way instruction decoders.
For x86, icelake/zen3 have 5-uop-wide decoders, so for a small loop with
<= 4 instructions (usually 3 uops, with a cmp/jmp pair that can be
macro-fused), the decoder would have a 2-uop bubble in each iteration
and the pipeline could not be fully utilized.

Therefore, this patch enables loop unrolling for small loops at O2
to fill the decoder as much as possible. It turns on RTL loop
unrolling only when targetm.loop_unroll_adjust exists and the function
is optimized for speed at O2 or above. In the x86 backend the default
behavior is to unroll small loops with less than 4 insns once.

This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with a
0.9% code size increase. For other benchmarks the variations are minor
and overall code size increased by 0.2%.

The kernel image size increased by 0.06%, and no impact on eembc.

gcc/ChangeLog:

	* common/config/i386/i386-common.cc (ix86_optimization_table):
	Enable small loop unroll at O2 by default.
	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
	factor if -munroll-only-small-loops enabled and -funroll-loops/
	-funroll-all-loops are disabled.
	* config/i386/i386.h (struct processor_costs): Add 2 field
	small_unroll_ninsns and small_unroll_factor.
	* config/i386/i386.opt: Add -munroll-only-small-loops.
	* doc/invoke.texi: Document -munroll-only-small-loops.
	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
	loop unrolling for -O2-speed and above if target hook
	loop_unroll_adjust exists.
	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
	when target hook loop_unroll_adjust exists.
	* config/i386/x86-tune-costs.h: Update all processor costs
	with small_unroll_ninsns = 4 and small_unroll_factor = 2.

gcc/testsuite/ChangeLog:

	* gcc.dg/guality/loop-1.c: Add additional option
	-mno-unroll-only-small-loops.
	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
	* gcc.target/i386/pr93002.c: Likewise.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=071e428c24ee8c1ed062597a093708bba29509c9
---
 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ++++++++++++++++++
 gcc.spec                                      |   3 +
 2 files changed, 484 insertions(+)
 create mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch

diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch
new file mode 100644
index 0000000..b16171b
--- /dev/null
+++ b/0026-Enable-small-loop-unrolling-for-O2.patch
@@ -0,0 +1,481 @@
+From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Thu, 8 Sep 2022 16:52:02 +0800
+Subject: [PATCH 4/5] Enable small loop unrolling for O2
+
+Modern processors has multiple way instruction decoders
+For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
+instructions (usually has 3 uops with a cmp/jmp pair that can be
+macro-fused), the decoder would have 2 uops bubble for each iteration
+and the pipeline could not be fully utilized.
+
+Therefore, this patch enables loop unrolling for small size loop at O2
+to fullfill the decoder as much as possible. It turns on rtl loop
+unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
+In x86 backend the default behavior is to unroll small loops with less
+than 4 insns by 1 time.
+
+This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
+0.9% codesize increment. For other benchmarks the variants are minor
+and overall codesize increased by 0.2%.
+
+The kernel image size increased by 0.06%, and no impact on eembc.
+
+gcc/ChangeLog:
+
+	* common/config/i386/i386-common.cc (ix86_optimization_table):
+	Enable small loop unroll at O2 by default.
+	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
+	factor if -munroll-only-small-loops enabled and -funroll-loops/
+	-funroll-all-loops are disabled.
+	* config/i386/i386.h (struct processor_costs): Add 2 field
+	small_unroll_ninsns and small_unroll_factor.
+	* config/i386/i386.opt: Add -munroll-only-small-loops.
+	* doc/invoke.texi: Document -munroll-only-small-loops.
+	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
+	loop unrolling for -O2-speed and above if target hook
+	loop_unroll_adjust exists.
+	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
+	when target hook loop_unroll_adjust exists.
+	* config/i386/x86-tune-costs.h: Update all processor costs
+	with small_unroll_ninsns = 4 and small_unroll_factor = 2.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.dg/guality/loop-1.c: Add additional option
+	-mno-unroll-only-small-loops.
+	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
+	* gcc.target/i386/pr93002.c: Likewise.
+---
+ gcc/common/config/i386/i386-common.cc   |  1 +
+ gcc/config/i386/i386.cc                 | 18 ++++++++
+ gcc/config/i386/i386.h                  |  5 +++
+ gcc/config/i386/i386.opt                |  4 ++
+ gcc/config/i386/x86-tune-costs.h        | 56 +++++++++++++++++++++++++
+ gcc/doc/invoke.texi                     | 11 ++++-
+ gcc/loop-init.cc                        | 10 +++--
+ gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 +
+ gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
+ gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
+ 10 files changed, 105 insertions(+), 6 deletions(-)
+
+diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
+index 07fdd045f30..e1c1fb07d8a 100644
+--- a/gcc/common/config/i386/i386-common.cc
++++ b/gcc/common/config/i386/i386-common.cc
+@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
+     /* The STC algorithm produces the smallest code at -Os, for x86.  */
+     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
+       REORDER_BLOCKS_ALGORITHM_STC },
++    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
+     /* Turn off -fschedule-insns by default.  It tends to make the
+        problem with not enough registers even worse.  */
+     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index b16df5b183e..39b2468799c 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
+   unsigned i;
+   unsigned mem_count = 0;
+ 
++  /* Unroll small size loop when unroll factor is not explicitly
++     specified.  */
++  if (!(flag_unroll_loops
++	|| flag_unroll_all_loops
++	|| loop->unroll))
++    {
++      nunroll = 1;
++
++      /* Any explicit -f{no-}unroll-{all-}loops turns off
++	 -munroll-only-small-loops.  */
++      if (ix86_unroll_only_small_loops
++	  && !OPTION_SET_P (flag_unroll_loops)
++	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
++	nunroll = ix86_cost->small_unroll_factor;
++
++      return nunroll;
++    }
++
+   if (!TARGET_ADJUST_UNROLL)
+      return nunroll;
+ 
+diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
+index a61c32b8957..421801111a7 100644
+--- a/gcc/config/i386/i386.h
++++ b/gcc/config/i386/i386.h
+@@ -219,6 +219,11 @@ struct processor_costs {
+   const char *const align_jump;		/* Jump alignment.  */
+   const char *const align_label;	/* Label alignment.  */
+   const char *const align_func;		/* Function alignment.  */
++
++  const unsigned small_unroll_ninsns;	/* Insn count limit for small loop
++					   to be unrolled.  */
++  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
++					   be unrolled.  */
+ };
+ 
+ extern const struct processor_costs *ix86_cost;
+diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
+index a6b0e28f238..3d369647bf7 100644
+--- a/gcc/config/i386/i386.opt
++++ b/gcc/config/i386/i386.opt
+@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
+ -param=x86-stlf-window-ninsns=
+ Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
+ Instructions number above which STFL stall penalty can be compensated.
++
++munroll-only-small-loops
++Target Var(ix86_unroll_only_small_loops) Init(0) Save
++Enable conservative small loop unrolling.
+diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
+index 017ffa69958..b4303e4e971 100644
+--- a/gcc/config/i386/x86-tune-costs.h
++++ b/gcc/config/i386/x86-tune-costs.h
+@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* Processor costs (relative to an add) */
+@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
+   "4",					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   "4",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs i486_memcpy[2] = {
+@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs pentium_memcpy[2] = {
+@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static const
+@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
+@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs geode_memcpy[2] = {
+@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs k6_memcpy[2] = {
+@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
+   "32:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "32",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* For some reason, Athlon deals better with REP prefix (relative to loops)
+@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* K8 has optimized REP instruction for medium sized blocks, but for very
+@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
+@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
+   "32:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "32",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /*  BDVER has optimized REP instruction for medium sized blocks, but for
+@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ 
+@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
+@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ struct processor_costs znver3_cost = {
+@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
+@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* icelake_cost should produce code tuned for Icelake family of CPUs.
+@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
+@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
+@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs btver2_memcpy[2] = {
+@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs pentium4_memcpy[2] = {
+@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs nocona_memcpy[2] = {
+@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs atom_memcpy[2] = {
+@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs slm_memcpy[2] = {
+@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs tremont_memcpy[2] = {
+@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs intel_memcpy[2] = {
+@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* Generic should produce code tuned for Core-i7 (and newer chips)
+@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* core_cost should produce code tuned for Core familly of CPUs.  */
+@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 9ac7f89ebb1..1961cafa2bb 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options.
+ -mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
+ -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
+ -mindirect-branch-register -mharden-sls=@var{choice} @gol
+--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
++-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
++-munroll-only-small-loops}
+ 
+ @emph{x86 Windows Options}
+ @gccoptlist{-mconsole  -mcygwin  -mno-cygwin  -mdll @gol
+@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols.  The default is
+ @option{-mno-direct-extern-access} and executable compiled with
+ @option{-mdirect-extern-access} may not be binary compatible if
+ protected symbols are used in shared libraries and executable.
++
++@item -munroll-only-small-loops
++@opindex munroll-only-small-loops
++@opindex mno-unroll-only-small-loops
++Controls conservative small loop unrolling. It is default enabled by
++O2, and unrolls loop with less than 4 insns by 1 time. Explicit
++-f[no-]unroll-[all-]loops would disable this flag to avoid any
++unintended unrolling behavior that user does not want.
+ @end table
+ 
+ @node x86 Windows Options
+diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
+index 1e4f6cfd7fb..84336865ef7 100644
+--- a/gcc/loop-init.cc
++++ b/gcc/loop-init.cc
+@@ -565,9 +565,12 @@ public:
+   {}
+ 
+   /* opt_pass methods: */
+-  virtual bool gate (function *)
++  virtual bool gate (function * fun)
+     {
+-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
++      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
++	      || (targetm.loop_unroll_adjust
++		  && optimize >= 2
++		  && optimize_function_for_speed_p (fun)));
+     }
+ 
+   virtual unsigned int execute (function *);
+@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
+       if (dump_file)
+ 	df_dump (dump_file);
+ 
+-      if (flag_unroll_loops)
++      if (flag_unroll_loops
++	  || targetm.loop_unroll_adjust)
+ 	flags |= UAP_UNROLL;
+       if (flag_unroll_all_loops)
+ 	flags |= UAP_UNROLL_ALL;
+diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
+index 1b1f6d32271..a32ea445a3f 100644
+--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
+@@ -1,5 +1,7 @@
+ /* { dg-do run } */
+ /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
++/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
++
+ 
+ #include "../nop.h"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
+index 81841ef5bd7..cbc9fbb0450 100644
+--- a/gcc/testsuite/gcc.target/i386/pr86270.c
++++ b/gcc/testsuite/gcc.target/i386/pr86270.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+ 
+ int *a;
+ long len;
+diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
+index 0248fcc00a5..f75a847f75d 100644
+--- a/gcc/testsuite/gcc.target/i386/pr93002.c
++++ b/gcc/testsuite/gcc.target/i386/pr93002.c
+@@ -1,6 +1,6 @@
+ /* PR target/93002 */
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+ /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
+ 
+ volatile int sink;
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index d512293..2190a05 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -390,6 +390,7 @@ Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 Patch3021: 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
 Patch3022: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
+Patch3023: 0026-Enable-small-loop-unrolling-for-O2.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -798,6 +799,7 @@ cd ..
 %patch3020 -p1 -b .dts-test-20~
 %patch3021 -p1 -b .dts-test-21~
 %patch3022 -p1 -b .dts-test-22~
+%patch3023 -p1 -b .dts-test-23~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2968,6 +2970,7 @@ fi
 - i386: Add syscall to enable AMX for latest kernels
 - Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
 - Add attribute hot judgement for INLINE_HINT_known_hot hint
+- Enable small loop unrolling for O2
 
 * Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.2
 - recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789)
-- 
Gitee


From 77a8d3e201ba51955518a8594f8e69b6c5020ffb Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Tue, 29 Nov 2022 14:22:58 +0800
Subject: [PATCH 6/6] i386: Only enable small loop unrolling in backend [PR
 107692]

Following the discussion in PR 107692, -munroll-only-small-loops
does not turn -funroll-loops on or off, and the current check in
pass_rtl_unroll_loops::gate would cause -fno-unroll-loops to have no
effect. Revert the change about targetm.loop_unroll_adjust and apply
the backend option change to strictly follow the rule that
-funroll-loops takes full control of loop unrolling, and
-munroll-only-small-loops just changes its behavior to unroll only
small loops.

gcc/ChangeLog:

	PR target/107692
	* common/config/i386/i386-common.cc (ix86_optimization_table):
	Enable loop unroll at O2, disable -fweb and -frename-registers
	by default.
	* config/i386/i386-options.cc
	(ix86_override_options_after_change):
	Disable small loop unroll when funroll-loops enabled, reset
	cunroll_grow_size when it is not explicitly enabled.
	(ix86_option_override_internal): Call
	ix86_override_options_after_change instead of calling
	ix86_recompute_optlev_based_flags and ix86_default_align
	separately.
	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
	factor if -munroll-only-small-loops enabled.
	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
	loop unrolling for -O2-speed.
	(pass_rtl_unroll_loops::execute): Remove
	targetm.loop_unroll_adjust check.

gcc/testsuite/ChangeLog:

	PR target/107692
	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
	* gcc.target/i386/pr93002.c: Likewise.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8caf155a3d6e23e47bf55068ad23c23d4655a054
---
 ...-small-loop-unrolling-in-backend-PR-.patch | 231 ++++++++++++++++++
 gcc.spec                                      |   3 +
 2 files changed, 234 insertions(+)
 create mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch

diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
new file mode 100644
index 0000000..de3995f
--- /dev/null
+++ b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
@@ -0,0 +1,231 @@
+From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Sat, 19 Nov 2022 09:38:00 +0800
+Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR
+ 107692]
+
+Followed by the discussion in pr107692, -munroll-only-small-loops
+does not turn on/off -funroll-loops, and the current check in
+pass_rtl_unroll_loops::gate would cause -fno-unroll-loops to not take
+effect. Revert the change about targetm.loop_unroll_adjust and apply
+the backend option change to strictly follow the rule that
+-funroll-loops takes full control of loop unrolling, and
+-munroll-only-small-loops just changes its behavior to unroll small size
+loops.
+
+gcc/ChangeLog:
+
+	PR target/107692
+	* common/config/i386/i386-common.cc (ix86_optimization_table):
+	Enable loop unroll O2, disable -fweb and -frename-registers
+	by default.
+	* config/i386/i386-options.cc
+	(ix86_override_options_after_change):
+	Disable small loop unroll when funroll-loops enabled, reset
+	cunroll_grow_size when it is not explicitly enabled.
+	(ix86_option_override_internal): Call
+	ix86_override_options_after_change instead of calling
+	ix86_recompute_optlev_based_flags and ix86_default_align
+	separately.
+	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
+	factor if -munroll-only-small-loops enabled.
+	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
+	loop unrolling for -O2-speed.
+	(pass_rtl_unroll_loops::execute): Remove
+	targetm.loop_unroll_adjust check.
+
+gcc/testsuite/ChangeLog:
+
+	PR target/107692
+	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
+	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
+	* gcc.target/i386/pr93002.c: Likewise.
+---
+ gcc/common/config/i386/i386-common.cc   |  8 ++++++
+ gcc/config/i386/i386-options.cc         | 34 ++++++++++++++++++++++---
+ gcc/config/i386/i386.cc                 | 18 ++++---------
+ gcc/loop-init.cc                        | 11 +++-----
+ gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 --
+ gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
+ gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
+ 7 files changed, 49 insertions(+), 28 deletions(-)
+
+diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
+index e1c1fb07d8a..5e777849f91 100644
+--- a/gcc/common/config/i386/i386-common.cc
++++ b/gcc/common/config/i386/i386-common.cc
+@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] =
+     /* The STC algorithm produces the smallest code at -Os, for x86.  */
+     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
+       REORDER_BLOCKS_ALGORITHM_STC },
++
++    /* Turn on -funroll-loops with -munroll-only-small-loops to enable small
++       loop unrolling at -O2.  */
++    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
+     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
++    /* Turns off -frename-registers and -fweb which are enabled by
++       funroll-loops.  */
++    { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
++    { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
+     /* Turn off -fschedule-insns by default.  It tends to make the
+        problem with not enough registers even worse.  */
+     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index 32cc58a764b..b853ff55825 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts,
+ void
+ ix86_override_options_after_change (void)
+ {
++  /* Default align_* from the processor table.  */
+   ix86_default_align (&global_options);
++
+   ix86_recompute_optlev_based_flags (&global_options, &global_options_set);
++
++  /* Disable unrolling small loops when there's explicit
++     -f{,no-}unroll-loops.  */
++  if ((OPTION_SET_P (flag_unroll_loops))
++     || (OPTION_SET_P (flag_unroll_all_loops)
++	 && flag_unroll_all_loops))
++    {
++      if (!OPTION_SET_P (ix86_unroll_only_small_loops))
++	ix86_unroll_only_small_loops = 0;
++      /* Re-enable -frename-registers and -fweb if funroll-loops
++	 enabled.  */
++      if (!OPTION_SET_P (flag_web))
++	flag_web = flag_unroll_loops;
++      if (!OPTION_SET_P (flag_rename_registers))
++	flag_rename_registers = flag_unroll_loops;
++      /* -fcunroll-grow-size default follows -f[no-]unroll-loops.  */
++      if (!OPTION_SET_P (flag_cunroll_grow_size))
++	flag_cunroll_grow_size = flag_unroll_loops
++				 || flag_peel_loops
++				 || optimize >= 3;
++    }
++  else
++    {
++      if (!OPTION_SET_P (flag_cunroll_grow_size))
++	flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
++    }
++
+ }
+ 
+ /* Clear stack slot assignments remembered from previous functions.
+@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p,
+ 
+   set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes);
+ 
+-  ix86_recompute_optlev_based_flags (opts, opts_set);
++  ix86_override_options_after_change ();
+ 
+   ix86_tune_cost = processor_cost_table[ix86_tune];
+   /* TODO: ix86_cost should be chosen at instruction or function granuality
+@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p,
+       || TARGET_64BIT_P (opts->x_ix86_isa_flags))
+     opts->x_ix86_regparm = REGPARM_MAX;
+ 
+-  /* Default align_* from the processor table.  */
+-  ix86_default_align (opts);
+-
+   /* Provide default for -mbranch-cost= value.  */
+   SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost,
+ 		       ix86_tune_cost->branch_cost);
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 39b2468799c..000415c0e2e 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
+ 
+   /* Unroll small size loop when unroll factor is not explicitly
+      specified.  */
+-  if (!(flag_unroll_loops
+-	|| flag_unroll_all_loops
+-	|| loop->unroll))
++  if (ix86_unroll_only_small_loops && !loop->unroll)
+     {
+-      nunroll = 1;
+-
+-      /* Any explicit -f{no-}unroll-{all-}loops turns off
+-	 -munroll-only-small-loops.  */
+-      if (ix86_unroll_only_small_loops
+-	  && !OPTION_SET_P (flag_unroll_loops)
+-	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+-	nunroll = ix86_cost->small_unroll_factor;
+-
+-      return nunroll;
++      if (loop->ninsns <= ix86_cost->small_unroll_ninsns)
++	return MIN (nunroll, ix86_cost->small_unroll_factor);
++      else
++	return 1;
+     }
+ 
+   if (!TARGET_ADJUST_UNROLL)
+diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
+index 84336865ef7..ed1b2f6ebab 100644
+--- a/gcc/loop-init.cc
++++ b/gcc/loop-init.cc
+@@ -565,12 +565,10 @@ public:
+   {}
+ 
+   /* opt_pass methods: */
+-  virtual bool gate (function * fun)
++  virtual bool gate (function *)
+     {
+-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+-	      || (targetm.loop_unroll_adjust
+-		  && optimize >= 2
+-		  && optimize_function_for_speed_p (fun)));
++      return (flag_unroll_loops || flag_unroll_all_loops
++	      || cfun->has_unroll);
+     }
+ 
+   virtual unsigned int execute (function *);
+@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun)
+       if (dump_file)
+ 	df_dump (dump_file);
+ 
+-      if (flag_unroll_loops
+-	  || targetm.loop_unroll_adjust)
++      if (flag_unroll_loops)
+ 	flags |= UAP_UNROLL;
+       if (flag_unroll_all_loops)
+ 	flags |= UAP_UNROLL_ALL;
+diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
+index a32ea445a3f..1b1f6d32271 100644
+--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
+@@ -1,7 +1,5 @@
+ /* { dg-do run } */
+ /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+-/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+-
+ 
+ #include "../nop.h"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
+index cbc9fbb0450..98b012caf23 100644
+--- a/gcc/testsuite/gcc.target/i386/pr86270.c
++++ b/gcc/testsuite/gcc.target/i386/pr86270.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
++/* { dg-options "-O2 -fno-unroll-loops" } */
+ 
+ int *a;
+ long len;
+diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
+index f75a847f75d..7e2d869e17b 100644
+--- a/gcc/testsuite/gcc.target/i386/pr93002.c
++++ b/gcc/testsuite/gcc.target/i386/pr93002.c
+@@ -1,6 +1,6 @@
+ /* PR target/93002 */
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
++/* { dg-options "-O2 -fno-unroll-loops" } */
+ /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
+ 
+ volatile int sink;
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 2190a05..1302126 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -391,6 +391,7 @@ Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 Patch3021: 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch
 Patch3022: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 Patch3023: 0026-Enable-small-loop-unrolling-for-O2.patch
+Patch3024: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -800,6 +801,7 @@ cd ..
 %patch3021 -p1 -b .dts-test-21~
 %patch3022 -p1 -b .dts-test-22~
 %patch3023 -p1 -b .dts-test-23~
+%patch3024 -p1 -b .dts-test-24~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2971,6 +2973,7 @@ fi
 - Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS
 - Add attribute hot judgement for INLINE_HINT_known_hot hint
 - Enable small loop unrolling for O2
+- i386: Only enable small loop unrolling in backend [PR 107692]
 
 * Fri Jul  8 2022 Marek Polacek <polacek@redhat.com> 12.1.1-3.2
 - recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789)
-- 
Gitee