From b7516c66adbc469e7b7e5b9564467630d917e7fd Mon Sep 17 00:00:00 2001
From: Renbo <rb01097748@alibaba-inc.com>
Date: Wed, 17 Jul 2024 16:37:46 +0800
Subject: [PATCH 1/6] update to gcc-toolset-12-gcc-12.2.1-7.6.src.rpm

Signed-off-by: Renbo <rb01097748@alibaba-inc.com>
---
 ...all-to-enable-AMX-for-latest-kernels.patch |  77 ---
 ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 -----
 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ------------------
 ...-small-loop-unrolling-in-backend-PR-.patch | 231 ---------
 ...hain-for-march-alderlake-and-sapphir.patch |  35 --
 dist                                          |   2 +-
 gcc.spec                                      |  26 +-
 gcc12-pr113960.patch                          | 107 ++++
 8 files changed, 120 insertions(+), 962 deletions(-)
 delete mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 delete mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 delete mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch
 delete mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
 delete mode 100644 0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch
 create mode 100644 gcc12-pr113960.patch

diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
deleted file mode 100644
index 94625b5..0000000
--- a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
+++ /dev/null
@@ -1,77 +0,0 @@
-From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001
-From: Haochen Jiang <haochen.jiang@intel.com>
-Date: Thu, 16 Jun 2022 00:15:53 -0700
-Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels
-
-gcc/testsuite/ChangeLog:
-
-	* gcc.target/i386/amx-check.h (request_perm_xtile_data):
-	New function to check if AMX is usable and enable AMX.
-	(main): Run test if AMX is usable.
----
- gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++
- 1 file changed, 30 insertions(+)
-
-diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
-index 434b0e59703..6fff5ff4631 100644
---- a/gcc/testsuite/gcc.target/i386/amx-check.h
-+++ b/gcc/testsuite/gcc.target/i386/amx-check.h
-@@ -4,11 +4,24 @@
- #include <stdlib.h>
- #include <string.h>
- #include <stdint.h>
-+#include <unistd.h>
-+#ifdef __linux__
-+#include <sys/syscall.h>
-+#endif
- #ifdef DEBUG
- #include <stdio.h>
- #endif
- #include "cpuid.h"
- 
-+#define XFEATURE_XTILECFG	17
-+#define XFEATURE_XTILEDATA	18
-+#define XFEATURE_MASK_XTILECFG	(1 << XFEATURE_XTILECFG)
-+#define XFEATURE_MASK_XTILEDATA	(1 << XFEATURE_XTILEDATA)
-+#define XFEATURE_MASK_XTILE	(XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
-+
-+#define ARCH_GET_XCOMP_PERM	0x1022
-+#define ARCH_REQ_XCOMP_PERM	0x1023
-+
- /* TODO: The tmm emulation is temporary for current
-    AMX implementation with no tmm regclass, should
-    be changed in the future. */
-@@ -44,6 +57,20 @@ typedef struct __tile
- /* Stride (colum width in byte) used for tileload/store */
- #define _STRIDE 64
- 
-+#ifdef __linux__
-+/* We need syscall to use amx functions */
-+int request_perm_xtile_data()
-+{
-+  unsigned long bitmask;
-+
-+  if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) ||
-+      syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask))
-+    return 0;
-+
-+  return (bitmask & XFEATURE_MASK_XTILE) != 0;
-+}
-+#endif
-+
- /* Initialize tile config by setting all tmm size to 16x64 */
- void init_tile_config (__tilecfg_u *dst)
- {
-@@ -185,6 +212,9 @@ main ()
- #endif
- #ifdef AMX_BF16
-       && __builtin_cpu_supports ("amx-bf16")
-+#endif
-+#ifdef __linux__
-+      && request_perm_xtile_data ()
- #endif
-       )
-     {
--- 
-2.18.2
-
diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
deleted file mode 100644
index 3e70f0c..0000000
--- a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
+++ /dev/null
@@ -1,123 +0,0 @@
-From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001
-From: "Cui,Lili" <lili.cui@intel.com>
-Date: Tue, 1 Nov 2022 09:16:49 +0800
-Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint.
-
-We set up INLINE_HINT_known_hot hint only when we have profile feedback,
-now add function attribute judgement for it, when both caller and callee
-have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint
-for it.
-
-With this patch applied,
-ADL Multi-copy:    538.imagic_r  16.7%
-ICX Multi-copy:    538.imagic_r  15.2%
-CLX Multi-copy:    538.imagic_r  12.7%
-Znver3 Multi-copy: 538.imagic_r  10.6%
-Arm Multi-copy:    538.imagic_r  13.4%
-
-gcc/ChangeLog
-
-	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
-	judgement for INLINE_HINT_known_hot hint.
-
-gcc/testsuite/ChangeLog:
-
-	* gcc.dg/ipa/inlinehint-6.c: New test.
----
- gcc/ipa-inline-analysis.cc              | 13 ++++---
- gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++
- 2 files changed, 56 insertions(+), 4 deletions(-)
- create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
-
-diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc
-index 1ca685d1b0e..7bd29c36590 100644
---- a/gcc/ipa-inline-analysis.cc
-+++ b/gcc/ipa-inline-analysis.cc
-@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
- #include "ipa-utils.h"
- #include "cfgexpand.h"
- #include "gimplify.h"
-+#include "attribs.h"
- 
- /* Cached node/edge growths.  */
- fast_call_summary<edge_growth_cache_entry *, va_heap> *edge_growth_cache = NULL;
-@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time)
-       hints = estimates.hints;
-     }
- 
--  /* When we have profile feedback, we can quite safely identify hot
--     edges and for those we disable size limits.  Don't do that when
--     probability that caller will call the callee is low however, since it
-+  /* When we have profile feedback or function attribute, we can quite safely
-+     identify hot edges and for those we disable size limits.  Don't do that
-+     when probability that caller will call the callee is low however, since it
-      may hurt optimization of the caller's hot path.  */
--  if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
-+  if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
-       && (edge->count.ipa ().apply_scale (2, 1)
- 	  > (edge->caller->inlined_to
- 	     ? edge->caller->inlined_to->count.ipa ()
- 	     : edge->caller->count.ipa ())))
-+      || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl))
-+	  != NULL
-+	 && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl))
-+	  != NULL))
-     hints |= INLINE_HINT_known_hot;
- 
-   gcc_checking_assert (size >= 0);
-diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
-new file mode 100644
-index 00000000000..1f3be641c6d
---- /dev/null
-+++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
-@@ -0,0 +1,47 @@
-+/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp"  } */
-+/* { dg-add-options bind_pic_locally } */
-+
-+#define size_t long long int
-+
-+struct A
-+{
-+  size_t f1, f2, f3, f4;
-+};
-+struct C
-+{
-+  struct A a;
-+  size_t b;
-+};
-+struct C x;
-+
-+__attribute__((hot)) struct C callee (struct A *a, struct C *c)
-+{
-+  c->a=(*a);
-+
-+  if((c->b + 7) & 17)
-+   {
-+      c->a.f1 = c->a.f2 + c->a.f1;
-+      c->a.f2 = c->a.f3 - c->a.f2;
-+      c->a.f3 = c->a.f2 + c->a.f3;
-+      c->a.f4 = c->a.f2 - c->a.f4;
-+      c->b = c->a.f2;
-+
-+    }
-+  return *c;
-+}
-+
-+__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c)
-+{
-+  struct A a;
-+  a.f1 = 1 + d;
-+  a.f2 = e;
-+  a.f3 = 12 + f;
-+  a.f4 = 68 + g;
-+  if (c->b > 0)
-+    return callee (&a, c);
-+  else
-+    return *c;
-+}
-+
-+/* { dg-final { scan-ipa-dump "known_hot"  "inline"  } } */
-+
--- 
-2.18.2
-
diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch
deleted file mode 100644
index b16171b..0000000
--- a/0026-Enable-small-loop-unrolling-for-O2.patch
+++ /dev/null
@@ -1,481 +0,0 @@
-From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001
-From: Hongyu Wang <hongyu.wang@intel.com>
-Date: Thu, 8 Sep 2022 16:52:02 +0800
-Subject: [PATCH 4/5] Enable small loop unrolling for O2
-
-Modern processors has multiple way instruction decoders
-For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
-instructions (usually has 3 uops with a cmp/jmp pair that can be
-macro-fused), the decoder would have 2 uops bubble for each iteration
-and the pipeline could not be fully utilized.
-
-Therefore, this patch enables loop unrolling for small size loop at O2
-to fullfill the decoder as much as possible. It turns on rtl loop
-unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
-In x86 backend the default behavior is to unroll small loops with less
-than 4 insns by 1 time.
-
-This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
-0.9% codesize increment. For other benchmarks the variants are minor
-and overall codesize increased by 0.2%.
-
-The kernel image size increased by 0.06%, and no impact on eembc.
-
-gcc/ChangeLog:
-
-	* common/config/i386/i386-common.cc (ix86_optimization_table):
-	Enable small loop unroll at O2 by default.
-	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
-	factor if -munroll-only-small-loops enabled and -funroll-loops/
-	-funroll-all-loops are disabled.
-	* config/i386/i386.h (struct processor_costs): Add 2 field
-	small_unroll_ninsns and small_unroll_factor.
-	* config/i386/i386.opt: Add -munroll-only-small-loops.
-	* doc/invoke.texi: Document -munroll-only-small-loops.
-	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
-	loop unrolling for -O2-speed and above if target hook
-	loop_unroll_adjust exists.
-	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
-	when target hook loop_unroll_adjust exists.
-	* config/i386/x86-tune-costs.h: Update all processor costs
-	with small_unroll_ninsns = 4 and small_unroll_factor = 2.
-
-gcc/testsuite/ChangeLog:
-
-	* gcc.dg/guality/loop-1.c: Add additional option
-	-mno-unroll-only-small-loops.
-	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
-	* gcc.target/i386/pr93002.c: Likewise.
----
- gcc/common/config/i386/i386-common.cc   |  1 +
- gcc/config/i386/i386.cc                 | 18 ++++++++
- gcc/config/i386/i386.h                  |  5 +++
- gcc/config/i386/i386.opt                |  4 ++
- gcc/config/i386/x86-tune-costs.h        | 56 +++++++++++++++++++++++++
- gcc/doc/invoke.texi                     | 11 ++++-
- gcc/loop-init.cc                        | 10 +++--
- gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 +
- gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
- gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
- 10 files changed, 105 insertions(+), 6 deletions(-)
-
-diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
-index 07fdd045f30..e1c1fb07d8a 100644
---- a/gcc/common/config/i386/i386-common.cc
-+++ b/gcc/common/config/i386/i386-common.cc
-@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
-     /* The STC algorithm produces the smallest code at -Os, for x86.  */
-     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
-       REORDER_BLOCKS_ALGORITHM_STC },
-+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
-     /* Turn off -fschedule-insns by default.  It tends to make the
-        problem with not enough registers even worse.  */
-     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
-diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
-index b16df5b183e..39b2468799c 100644
---- a/gcc/config/i386/i386.cc
-+++ b/gcc/config/i386/i386.cc
-@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
-   unsigned i;
-   unsigned mem_count = 0;
- 
-+  /* Unroll small size loop when unroll factor is not explicitly
-+     specified.  */
-+  if (!(flag_unroll_loops
-+	|| flag_unroll_all_loops
-+	|| loop->unroll))
-+    {
-+      nunroll = 1;
-+
-+      /* Any explicit -f{no-}unroll-{all-}loops turns off
-+	 -munroll-only-small-loops.  */
-+      if (ix86_unroll_only_small_loops
-+	  && !OPTION_SET_P (flag_unroll_loops)
-+	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
-+	nunroll = ix86_cost->small_unroll_factor;
-+
-+      return nunroll;
-+    }
-+
-   if (!TARGET_ADJUST_UNROLL)
-      return nunroll;
- 
-diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
-index a61c32b8957..421801111a7 100644
---- a/gcc/config/i386/i386.h
-+++ b/gcc/config/i386/i386.h
-@@ -219,6 +219,11 @@ struct processor_costs {
-   const char *const align_jump;		/* Jump alignment.  */
-   const char *const align_label;	/* Label alignment.  */
-   const char *const align_func;		/* Function alignment.  */
-+
-+  const unsigned small_unroll_ninsns;	/* Insn count limit for small loop
-+					   to be unrolled.  */
-+  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
-+					   be unrolled.  */
- };
- 
- extern const struct processor_costs *ix86_cost;
-diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
-index a6b0e28f238..3d369647bf7 100644
---- a/gcc/config/i386/i386.opt
-+++ b/gcc/config/i386/i386.opt
-@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
- -param=x86-stlf-window-ninsns=
- Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
- Instructions number above which STFL stall penalty can be compensated.
-+
-+munroll-only-small-loops
-+Target Var(ix86_unroll_only_small_loops) Init(0) Save
-+Enable conservative small loop unrolling.
-diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
-index 017ffa69958..b4303e4e971 100644
---- a/gcc/config/i386/x86-tune-costs.h
-+++ b/gcc/config/i386/x86-tune-costs.h
-@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* Processor costs (relative to an add) */
-@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
-   "4",					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   "4",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs i486_memcpy[2] = {
-@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs pentium_memcpy[2] = {
-@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static const
-@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
-@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs geode_memcpy[2] = {
-@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs k6_memcpy[2] = {
-@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
-   "32:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "32",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* For some reason, Athlon deals better with REP prefix (relative to loops)
-@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* K8 has optimized REP instruction for medium sized blocks, but for very
-@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
-@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
-   "32:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "32",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /*  BDVER has optimized REP instruction for medium sized blocks, but for
-@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "11",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- 
-@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
-@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- struct processor_costs znver3_cost = {
-@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
-   "16",					/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
-@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* icelake_cost should produce code tuned for Icelake family of CPUs.
-@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
-@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
-   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
-@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "11",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs btver2_memcpy[2] = {
-@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "11",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs pentium4_memcpy[2] = {
-@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = {
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs nocona_memcpy[2] = {
-@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = {
-   NULL,					/* Jump alignment.  */
-   NULL,					/* Label alignment.  */
-   NULL,					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs atom_memcpy[2] = {
-@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs slm_memcpy[2] = {
-@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs tremont_memcpy[2] = {
-@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- static stringop_algs intel_memcpy[2] = {
-@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = {
-   "16:8:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* Generic should produce code tuned for Core-i7 (and newer chips)
-@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
- /* core_cost should produce code tuned for Core familly of CPUs.  */
-@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = {
-   "16:11:8",				/* Jump alignment.  */
-   "0:0:8",				/* Label alignment.  */
-   "16",					/* Func alignment.  */
-+  4,					/* Small unroll limit.  */
-+  2,					/* Small unroll factor.  */
- };
- 
-diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
-index 9ac7f89ebb1..1961cafa2bb 100644
---- a/gcc/doc/invoke.texi
-+++ b/gcc/doc/invoke.texi
-@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options.
- -mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
- -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
- -mindirect-branch-register -mharden-sls=@var{choice} @gol
---mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
-+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
-+-munroll-only-small-loops}
- 
- @emph{x86 Windows Options}
- @gccoptlist{-mconsole  -mcygwin  -mno-cygwin  -mdll @gol
-@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols.  The default is
- @option{-mno-direct-extern-access} and executable compiled with
- @option{-mdirect-extern-access} may not be binary compatible if
- protected symbols are used in shared libraries and executable.
-+
-+@item -munroll-only-small-loops
-+@opindex munroll-only-small-loops
-+@opindex mno-unroll-only-small-loops
-+Controls conservative small loop unrolling. It is default enabled by
-+O2, and unrolls loop with less than 4 insns by 1 time. Explicit
-+-f[no-]unroll-[all-]loops would disable this flag to avoid any
-+unintended unrolling behavior that user does not want.
- @end table
- 
- @node x86 Windows Options
-diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
-index 1e4f6cfd7fb..84336865ef7 100644
---- a/gcc/loop-init.cc
-+++ b/gcc/loop-init.cc
-@@ -565,9 +565,12 @@ public:
-   {}
- 
-   /* opt_pass methods: */
--  virtual bool gate (function *)
-+  virtual bool gate (function * fun)
-     {
--      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
-+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
-+	      || (targetm.loop_unroll_adjust
-+		  && optimize >= 2
-+		  && optimize_function_for_speed_p (fun)));
-     }
- 
-   virtual unsigned int execute (function *);
-@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
-       if (dump_file)
- 	df_dump (dump_file);
- 
--      if (flag_unroll_loops)
-+      if (flag_unroll_loops
-+	  || targetm.loop_unroll_adjust)
- 	flags |= UAP_UNROLL;
-       if (flag_unroll_all_loops)
- 	flags |= UAP_UNROLL_ALL;
-diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
-index 1b1f6d32271..a32ea445a3f 100644
---- a/gcc/testsuite/gcc.dg/guality/loop-1.c
-+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
-@@ -1,5 +1,7 @@
- /* { dg-do run } */
- /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
-+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
-+
- 
- #include "../nop.h"
- 
-diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
-index 81841ef5bd7..cbc9fbb0450 100644
---- a/gcc/testsuite/gcc.target/i386/pr86270.c
-+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2" } */
-+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
- 
- int *a;
- long len;
-diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
-index 0248fcc00a5..f75a847f75d 100644
---- a/gcc/testsuite/gcc.target/i386/pr93002.c
-+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
-@@ -1,6 +1,6 @@
- /* PR target/93002 */
- /* { dg-do compile } */
--/* { dg-options "-O2" } */
-+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
- /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
- 
- volatile int sink;
--- 
-2.18.2
-
diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
deleted file mode 100644
index de3995f..0000000
--- a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
+++ /dev/null
@@ -1,231 +0,0 @@
-From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001
-From: Hongyu Wang <hongyu.wang@intel.com>
-Date: Sat, 19 Nov 2022 09:38:00 +0800
-Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR
- 107692]
-
-Followed by the discussion in pr107692, -munroll-only-small-loops
-Does not turns on/off -funroll-loops, and current check in
-pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take
-effect. Revert the change about targetm.loop_unroll_adjust and apply
-the backend option change to strictly follow the rule that
--funroll-loops takes full control of loop unrolling, and
-munroll-only-small-loops just change its behavior to unroll small size
-loops.
-
-gcc/ChangeLog:
-
-	PR target/107692
-	* common/config/i386/i386-common.cc (ix86_optimization_table):
-	Enable loop unroll O2, disable -fweb and -frename-registers
-	by default.
-	* config/i386/i386-options.cc
-	(ix86_override_options_after_change):
-	Disable small loop unroll when funroll-loops enabled, reset
-	cunroll_grow_size when it is not explicitly enabled.
-	(ix86_option_override_internal): Call
-	ix86_override_options_after_change instead of calling
-	ix86_recompute_optlev_based_flags and ix86_default_align
-	separately.
-	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
-	factor if -munroll-only-small-loops enabled.
-	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
-	loop unrolling for -O2-speed.
-	(pass_rtl_unroll_loops::execute): Rmove
-	targetm.loop_unroll_adjust check.
-
-gcc/testsuite/ChangeLog:
-
-	PR target/107692
-	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
-	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
-	* gcc.target/i386/pr93002.c: Likewise.
----
- gcc/common/config/i386/i386-common.cc   |  8 ++++++
- gcc/config/i386/i386-options.cc         | 34 ++++++++++++++++++++++---
- gcc/config/i386/i386.cc                 | 18 ++++---------
- gcc/loop-init.cc                        | 11 +++-----
- gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 --
- gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
- gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
- 7 files changed, 49 insertions(+), 28 deletions(-)
-
-diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
-index e1c1fb07d8a..5e777849f91 100644
---- a/gcc/common/config/i386/i386-common.cc
-+++ b/gcc/common/config/i386/i386-common.cc
-@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] =
-     /* The STC algorithm produces the smallest code at -Os, for x86.  */
-     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
-       REORDER_BLOCKS_ALGORITHM_STC },
-+
-+    /* Turn on -funroll-loops with -munroll-only-small-loops to enable small
-+       loop unrolling at -O2.  */
-+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
-     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
-+    /* Turns off -frename-registers and -fweb which are enabled by
-+       funroll-loops.  */
-+    { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
-+    { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
-     /* Turn off -fschedule-insns by default.  It tends to make the
-        problem with not enough registers even worse.  */
-     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
-diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
-index 32cc58a764b..b853ff55825 100644
---- a/gcc/config/i386/i386-options.cc
-+++ b/gcc/config/i386/i386-options.cc
-@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts,
- void
- ix86_override_options_after_change (void)
- {
-+  /* Default align_* from the processor table.  */
-   ix86_default_align (&global_options);
-+
-   ix86_recompute_optlev_based_flags (&global_options, &global_options_set);
-+
-+  /* Disable unrolling small loops when there's explicit
-+     -f{,no}unroll-loop.  */
-+  if ((OPTION_SET_P (flag_unroll_loops))
-+     || (OPTION_SET_P (flag_unroll_all_loops)
-+	 && flag_unroll_all_loops))
-+    {
-+      if (!OPTION_SET_P (ix86_unroll_only_small_loops))
-+	ix86_unroll_only_small_loops = 0;
-+      /* Re-enable -frename-registers and -fweb if funroll-loops
-+	 enabled.  */
-+      if (!OPTION_SET_P (flag_web))
-+	flag_web = flag_unroll_loops;
-+      if (!OPTION_SET_P (flag_rename_registers))
-+	flag_rename_registers = flag_unroll_loops;
-+      /* -fcunroll-grow-size default follws -f[no]-unroll-loops.  */
-+      if (!OPTION_SET_P (flag_cunroll_grow_size))
-+	flag_cunroll_grow_size = flag_unroll_loops
-+				 || flag_peel_loops
-+				 || optimize >= 3;
-+    }
-+  else
-+    {
-+      if (!OPTION_SET_P (flag_cunroll_grow_size))
-+	flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
-+    }
-+
- }
- 
- /* Clear stack slot assignments remembered from previous functions.
-@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p,
- 
-   set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes);
- 
--  ix86_recompute_optlev_based_flags (opts, opts_set);
-+  ix86_override_options_after_change ();
- 
-   ix86_tune_cost = processor_cost_table[ix86_tune];
-   /* TODO: ix86_cost should be chosen at instruction or function granuality
-@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p,
-       || TARGET_64BIT_P (opts->x_ix86_isa_flags))
-     opts->x_ix86_regparm = REGPARM_MAX;
- 
--  /* Default align_* from the processor table.  */
--  ix86_default_align (opts);
--
-   /* Provide default for -mbranch-cost= value.  */
-   SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost,
- 		       ix86_tune_cost->branch_cost);
-diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
-index 39b2468799c..000415c0e2e 100644
---- a/gcc/config/i386/i386.cc
-+++ b/gcc/config/i386/i386.cc
-@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
- 
-   /* Unroll small size loop when unroll factor is not explicitly
-      specified.  */
--  if (!(flag_unroll_loops
--	|| flag_unroll_all_loops
--	|| loop->unroll))
-+  if (ix86_unroll_only_small_loops && !loop->unroll)
-     {
--      nunroll = 1;
--
--      /* Any explicit -f{no-}unroll-{all-}loops turns off
--	 -munroll-only-small-loops.  */
--      if (ix86_unroll_only_small_loops
--	  && !OPTION_SET_P (flag_unroll_loops)
--	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
--	nunroll = ix86_cost->small_unroll_factor;
--
--      return nunroll;
-+      if (loop->ninsns <= ix86_cost->small_unroll_ninsns)
-+	return MIN (nunroll, ix86_cost->small_unroll_factor);
-+      else
-+	return 1;
-     }
- 
-   if (!TARGET_ADJUST_UNROLL)
-diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
-index 84336865ef7..ed1b2f6ebab 100644
---- a/gcc/loop-init.cc
-+++ b/gcc/loop-init.cc
-@@ -565,12 +565,10 @@ public:
-   {}
- 
-   /* opt_pass methods: */
--  virtual bool gate (function * fun)
-+  virtual bool gate (function *)
-     {
--      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
--	      || (targetm.loop_unroll_adjust
--		  && optimize >= 2
--		  && optimize_function_for_speed_p (fun)));
-+      return (flag_unroll_loops || flag_unroll_all_loops
-+	      || cfun->has_unroll);
-     }
- 
-   virtual unsigned int execute (function *);
-@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun)
-       if (dump_file)
- 	df_dump (dump_file);
- 
--      if (flag_unroll_loops
--	  || targetm.loop_unroll_adjust)
-+      if (flag_unroll_loops)
- 	flags |= UAP_UNROLL;
-       if (flag_unroll_all_loops)
- 	flags |= UAP_UNROLL_ALL;
-diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
-index a32ea445a3f..1b1f6d32271 100644
---- a/gcc/testsuite/gcc.dg/guality/loop-1.c
-+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
-@@ -1,7 +1,5 @@
- /* { dg-do run } */
- /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
--/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
--
- 
- #include "../nop.h"
- 
-diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
-index cbc9fbb0450..98b012caf23 100644
---- a/gcc/testsuite/gcc.target/i386/pr86270.c
-+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
-@@ -1,5 +1,5 @@
- /* { dg-do compile } */
--/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
-+/* { dg-options "-O2 -fno-unroll-loops" } */
- 
- int *a;
- long len;
-diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
-index f75a847f75d..7e2d869e17b 100644
---- a/gcc/testsuite/gcc.target/i386/pr93002.c
-+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
-@@ -1,6 +1,6 @@
- /* PR target/93002 */
- /* { dg-do compile } */
--/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
-+/* { dg-options "-O2 -fno-unroll-loops" } */
- /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
- 
- volatile int sink;
--- 
-2.18.2
-
diff --git a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch
deleted file mode 100644
index ad65965..0000000
--- a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From b7980cd8d8bcf41b3ca1b6f3ba147789d42a9b99 Mon Sep 17 00:00:00 2001
-From: Hongyu Wang <hongyu.wang@intel.com>
-Date: Tue, 6 Dec 2022 09:53:35 +0800
-Subject: [PATCH] i386: Avoid fma_chain for -march=alderlake and
- sapphirerapids.
-
-For Alderlake there is similar issue like PR 81616, enable
-avoid_fma256_chain will also benefit on Intel latest platforms
-Alderlake and Sapphire Rapids.
-
-gcc/ChangeLog:
-
-	* config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add
-	m_SAPPHIRERAPIDS, m_ALDERLAKE.
----
- gcc/config/i386/x86-tune.def | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
-index d983e2f6213..1e1b206a71c 100644
---- a/gcc/config/i386/x86-tune.def
-+++ b/gcc/config/i386/x86-tune.def
-@@ -485,7 +485,8 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
- 
- /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
-    smaller FMA chain.  */
--DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3)
-+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
-+	  | m_ALDERLAKE | m_SAPPHIRERAPIDS)
- 
- /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
-    for v2df vector reduction.  */
--- 
-2.18.2
-
diff --git a/dist b/dist
index 9c0e36e..1fe92cf 100644
--- a/dist
+++ b/dist
@@ -1 +1 @@
-an8
+an8_10
diff --git a/gcc.spec b/gcc.spec
index 4622976..b7955aa 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -1,4 +1,6 @@
-%define anolis_release .0.1
+%{?scl_package:%global scl gcc-toolset-12}
+%global scl_prefix gcc-toolset-12-
+BuildRequires: scl-utils-build
 %global __python /usr/bin/python3
 %{?scl:%global __strip %%{_scl_root}/usr/bin/strip}
 %{?scl:%global __objdump %%{_scl_root}/usr/bin/objdump}
@@ -148,7 +150,7 @@
 Summary: GCC version 12
 Name: %{?scl_prefix}gcc
 Version: %{gcc_version}
-Release: %{gcc_release}.4%{anolis_release}%{?dist}
+Release: %{gcc_release}.6%{?dist}
 # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have
 # GCC Runtime Exception.
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
@@ -194,10 +196,8 @@ URL: http://gcc.gnu.org
 # Need binutils which support -plugin
 # Need binutils which support .loc view >= 2.30
 # Need binutils which support --generate-missing-build-notes=yes >= 2.31
-%if 0%{?scl:1}
 BuildRequires: %{?scl_prefix}binutils >= 2.31
 BuildRequires: %{?scl_prefix}gdb >= 7.4.50
-%endif
 # While gcc doesn't include statically linked binaries, during testing
 # -static is used several times.
 BuildRequires: glibc-static
@@ -352,6 +352,7 @@ Patch11: gcc12-d-shared-libphobos.patch
 Patch12: gcc12-pr107468.patch
 Patch15: gcc12-static-libquadmath.patch
 Patch16: gcc12-FMA-chains.patch
+Patch17: gcc12-pr113960.patch
 
 Patch100: gcc12-fortran-fdec-duplicates.patch
 Patch101: gcc12-fortran-flogical-as-integer.patch
@@ -388,9 +389,6 @@ Patch3017: 0020-more-fixes.patch
 Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: gcc12-testsuite-typo.patch
-Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
-Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch
-Patch5003: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -735,6 +733,7 @@ so that there cannot be any synchronization problems.
 %patch12 -p0 -b .pr107468~
 %patch15 -p0 -b .static-libquadmath~
 %patch16 -p1 -b .fma~
+%patch17 -p1 -b .pr113960~
 
 %if 0%{?rhel} >= 6
 %patch100 -p1 -b .fortran-fdec-duplicates~
@@ -800,9 +799,6 @@ cd ..
 %patch3019 -p1 -b .dts-test-19~
 %endif
 %patch3020 -p1 -b .typo
-%patch5001 -p1 -b .dts-test-22~
-%patch5002 -p1 -b .dts-test-23~
-%patch5003 -p1 -b .dts-test-24~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2997,10 +2993,12 @@ fi
 %endif
 
 %changelog
-* Tue May 30 2023 Haochen Jiang <haochen.jiang@intel.com> 12.2.1-7.4.0.1
-- Add attribute hot judgement for INLINE_HINT_known_hot hint
-- Enable small loop unrolling for O2
-- i386: Only enable small loop unrolling in backend [PR 107692]
+* Wed Apr  3 2024 Marek Polacek <polacek@redhat.com> 12.2.1-7.6
+- bump NVR (RHEL-31253)
+
+* Tue Mar 26 2024 Marek Polacek <polacek@redhat.com> 12.2.1-7.5
+- fix conditions for using memcmp in
+  std::lexicographical_compare_three_way (PR libstdc++/113960, RHEL-29952)
 
 * Fri Feb 10 2023 Marek Polacek <polacek@redhat.com> 12.2.1-7.4
 - avoid fma_chain for -march=alderlake and sapphirerapids (#2168917)
diff --git a/gcc12-pr113960.patch b/gcc12-pr113960.patch
new file mode 100644
index 0000000..98d2f86
--- /dev/null
+++ b/gcc12-pr113960.patch
@@ -0,0 +1,107 @@
+commit 6f5dcea85a31845ec6f4b6886734b0f02e013718
+Author: Jonathan Wakely <jwakely@redhat.com>
+Date:   Tue Feb 27 17:50:34 2024 +0000
+
+    libstdc++: Fix conditions for using memcmp in std::lexicographical_compare_three_way [PR113960]
+    
+    The change in r11-2981-g2f983fa69005b6 meant that
+    std::lexicographical_compare_three_way started to use memcmp for
+    unsigned integers on big endian targets, but for that to be valid we
+    need the two value types to have the same size and we need to use that
+    size to compute the length passed to memcmp.
+    
+    I already defined a __is_memcmp_ordered_with trait that does the right
+    checks, std::lexicographical_compare_three_way just needs to use it.
+    
+    libstdc++-v3/ChangeLog:
+    
+            PR libstdc++/113960
+            * include/bits/stl_algobase.h (__is_byte_iter): Replace with ...
+            (__memcmp_ordered_with): New concept.
+            (lexicographical_compare_three_way): Use __memcmp_ordered_with
+            instead of __is_byte_iter. Use correct length for memcmp.
+            * testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc:
+            New test.
+    
+    (cherry picked from commit f5cdda8acb06c20335855ed353ab9a441c12128a)
+
+diff --git a/libstdc++-v3/include/bits/stl_algobase.h b/libstdc++-v3/include/bits/stl_algobase.h
+index 7664301a208..6e648e48ad0 100644
+--- a/libstdc++-v3/include/bits/stl_algobase.h
++++ b/libstdc++-v3/include/bits/stl_algobase.h
+@@ -1780,11 +1780,14 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO
+     }
+ 
+ #if __cpp_lib_three_way_comparison
+-  // Iter points to a contiguous range of unsigned narrow character type
+-  // or std::byte, suitable for comparison by memcmp.
+-  template<typename _Iter>
+-    concept __is_byte_iter = contiguous_iterator<_Iter>
+-      && __is_memcmp_ordered<iter_value_t<_Iter>>::__value;
++  // Both iterators refer to contiguous ranges of unsigned narrow characters,
++  // or std::byte, or big-endian unsigned integers, suitable for comparison
++  // using memcmp.
++  template<typename _Iter1, typename _Iter2>
++    concept __memcmp_ordered_with
++      = (__is_memcmp_ordered_with<iter_value_t<_Iter1>,
++				  iter_value_t<_Iter2>>::__value)
++	  && contiguous_iterator<_Iter1> && contiguous_iterator<_Iter2>;
+ 
+   // Return a struct with two members, initialized to the smaller of x and y
+   // (or x if they compare equal) and the result of the comparison x <=> y.
+@@ -1834,20 +1837,20 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO
+       if (!std::__is_constant_evaluated())
+ 	if constexpr (same_as<_Comp, __detail::_Synth3way>
+ 		      || same_as<_Comp, compare_three_way>)
+-	  if constexpr (__is_byte_iter<_InputIter1>)
+-	    if constexpr (__is_byte_iter<_InputIter2>)
+-	      {
+-		const auto [__len, __lencmp] = _GLIBCXX_STD_A::
+-		  __min_cmp(__last1 - __first1, __last2 - __first2);
+-		if (__len)
+-		  {
+-		    const auto __c
+-		      = __builtin_memcmp(&*__first1, &*__first2, __len) <=> 0;
+-		    if (__c != 0)
+-		      return __c;
+-		  }
+-		return __lencmp;
+-	      }
++	  if constexpr (__memcmp_ordered_with<_InputIter1, _InputIter2>)
++	    {
++	      const auto [__len, __lencmp] = _GLIBCXX_STD_A::
++		__min_cmp(__last1 - __first1, __last2 - __first2);
++	      if (__len)
++		{
++		  const auto __blen = __len * sizeof(*__first1);
++		  const auto __c
++		    = __builtin_memcmp(&*__first1, &*__first2, __blen) <=> 0;
++		  if (__c != 0)
++		    return __c;
++		}
++	      return __lencmp;
++	    }
+ 
+       while (__first1 != __last1)
+ 	{
+diff --git a/libstdc++-v3/testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc b/libstdc++-v3/testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc
+new file mode 100644
+index 00000000000..d51ae1a3d50
+--- /dev/null
++++ b/libstdc++-v3/testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc
+@@ -0,0 +1,15 @@
++// { dg-do run { target c++20 } }
++
++// PR libstdc++/113960
++// std::map with std::vector as input overwrites itself with c++20, on s390x
++
++#include <algorithm>
++#include <testsuite_hooks.h>
++
++int main()
++{
++  unsigned short a1[] { 1, 2, 3 };
++  unsigned short a2[] { 1, 2, 4 };
++  // Incorrect memcmp comparison for big endian targets.
++  VERIFY( std::lexicographical_compare_three_way(a1, a1+3, a2, a2+3) < 0 );
++}
-- 
Gitee


From c7167efd4293aa18e9f56fd57bea8cad688880ea Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Thu, 10 Nov 2022 09:40:26 +0800
Subject: [PATCH 2/6] i386: Add syscall to enable AMX for latest kernels

gcc/testsuite/ChangeLog:

	* gcc.target/i386/amx-check.h (request_perm_xtile_data):
	New function to check if AMX is usable and enable AMX.
	(main): Run test if AMX is usable.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5e377d21f1f345d8b157b9bc306e02bb9bd45e01
---
 ...all-to-enable-AMX-for-latest-kernels.patch | 77 +++++++++++++++++++
 gcc.spec                                      |  8 +-
 2 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch

diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
new file mode 100644
index 0000000..94625b5
--- /dev/null
+++ b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
@@ -0,0 +1,77 @@
+From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001
+From: Haochen Jiang <haochen.jiang@intel.com>
+Date: Thu, 16 Jun 2022 00:15:53 -0700
+Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/amx-check.h (request_perm_xtile_data):
+	New function to check if AMX is usable and enable AMX.
+	(main): Run test if AMX is usable.
+---
+ gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
+index 434b0e59703..6fff5ff4631 100644
+--- a/gcc/testsuite/gcc.target/i386/amx-check.h
++++ b/gcc/testsuite/gcc.target/i386/amx-check.h
+@@ -4,11 +4,24 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <stdint.h>
++#include <unistd.h>
++#ifdef __linux__
++#include <sys/syscall.h>
++#endif
+ #ifdef DEBUG
+ #include <stdio.h>
+ #endif
+ #include "cpuid.h"
+ 
++#define XFEATURE_XTILECFG	17
++#define XFEATURE_XTILEDATA	18
++#define XFEATURE_MASK_XTILECFG	(1 << XFEATURE_XTILECFG)
++#define XFEATURE_MASK_XTILEDATA	(1 << XFEATURE_XTILEDATA)
++#define XFEATURE_MASK_XTILE	(XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
++
++#define ARCH_GET_XCOMP_PERM	0x1022
++#define ARCH_REQ_XCOMP_PERM	0x1023
++
+ /* TODO: The tmm emulation is temporary for current
+    AMX implementation with no tmm regclass, should
+    be changed in the future. */
+@@ -44,6 +57,20 @@ typedef struct __tile
+ /* Stride (colum width in byte) used for tileload/store */
+ #define _STRIDE 64
+ 
++#ifdef __linux__
++/* We need syscall to use amx functions */
++int request_perm_xtile_data()
++{
++  unsigned long bitmask;
++
++  if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) ||
++      syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask))
++    return 0;
++
++  return (bitmask & XFEATURE_MASK_XTILE) != 0;
++}
++#endif
++
+ /* Initialize tile config by setting all tmm size to 16x64 */
+ void init_tile_config (__tilecfg_u *dst)
+ {
+@@ -185,6 +212,9 @@ main ()
+ #endif
+ #ifdef AMX_BF16
+       && __builtin_cpu_supports ("amx-bf16")
++#endif
++#ifdef __linux__
++      && request_perm_xtile_data ()
+ #endif
+       )
+     {
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index b7955aa..562650f 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -1,3 +1,4 @@
+%define anolis_release .0.1
 %{?scl_package:%global scl gcc-toolset-12}
 %global scl_prefix gcc-toolset-12-
 BuildRequires: scl-utils-build
@@ -150,7 +151,7 @@ BuildRequires: scl-utils-build
 Summary: GCC version 12
 Name: %{?scl_prefix}gcc
 Version: %{gcc_version}
-Release: %{gcc_release}.6%{?dist}
+Release: %{gcc_release}.6%{anolis_release}%{?dist}
 # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have
 # GCC Runtime Exception.
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
@@ -389,6 +390,7 @@ Patch3017: 0020-more-fixes.patch
 Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: gcc12-testsuite-typo.patch
+Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -799,6 +801,7 @@ cd ..
 %patch3019 -p1 -b .dts-test-19~
 %endif
 %patch3020 -p1 -b .typo
+%patch5000 -p1 -b .dts-test-20~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2993,6 +2996,9 @@ fi
 %endif
 
 %changelog
+* Wed Jul 17 2024 Haochen Jiang <haochen.jiang@intel.com> 12.2.1-7.6.0.1
+- i386: Add syscall to enable AMX for latest kernels
+
 * Wed Apr  3 2024 Marek Polacek <polacek@redhat.com> 12.2.1-7.6
 - bump NVR (RHEL-31253)
 
-- 
Gitee


From fc63f978f187017c26dd4840481f1aa9aef4cdd9 Mon Sep 17 00:00:00 2001
From: "Cui,Lili" <lili.cui@intel.com>
Date: Thu, 10 Nov 2022 09:48:30 +0800
Subject: [PATCH 3/6] Add attribute hot judgement for INLINE_HINT_known_hot
 hint.

Previously the INLINE_HINT_known_hot hint was set only when profile feedback
was available.  Now also check function attributes: when both the caller and
the callee have __attribute__((hot)), the INLINE_HINT_known_hot hint is set
for that edge as well.

With this patch applied,
ADL Multi-copy:    538.imagic_r  16.7%
ICX Multi-copy:    538.imagic_r  15.2%
CLX Multi-copy:    538.imagic_r  12.7%
Znver3 Multi-copy: 538.imagic_r  10.6%
Arm Multi-copy:    538.imagic_r  13.4%

gcc/ChangeLog

	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
	judgement for INLINE_HINT_known_hot hint.

gcc/testsuite/ChangeLog:

	* gcc.dg/ipa/inlinehint-6.c: New test.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a
---
 ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 ++++++++++++++++++
 gcc.spec                                      |   3 +
 2 files changed, 126 insertions(+)
 create mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch

diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
new file mode 100644
index 0000000..3e70f0c
--- /dev/null
+++ b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
@@ -0,0 +1,123 @@
+From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001
+From: "Cui,Lili" <lili.cui@intel.com>
+Date: Tue, 1 Nov 2022 09:16:49 +0800
+Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint.
+
+We set up INLINE_HINT_known_hot hint only when we have profile feedback,
+now add function attribute judgement for it, when both caller and callee
+have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint
+for it.
+
+With this patch applied,
+ADL Multi-copy:    538.imagic_r  16.7%
+ICX Multi-copy:    538.imagic_r  15.2%
+CLX Multi-copy:    538.imagic_r  12.7%
+Znver3 Multi-copy: 538.imagic_r  10.6%
+Arm Multi-copy:    538.imagic_r  13.4%
+
+gcc/ChangeLog
+
+	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
+	judgement for INLINE_HINT_known_hot hint.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.dg/ipa/inlinehint-6.c: New test.
+---
+ gcc/ipa-inline-analysis.cc              | 13 ++++---
+ gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++
+ 2 files changed, 56 insertions(+), 4 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+
+diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc
+index 1ca685d1b0e..7bd29c36590 100644
+--- a/gcc/ipa-inline-analysis.cc
++++ b/gcc/ipa-inline-analysis.cc
+@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "ipa-utils.h"
+ #include "cfgexpand.h"
+ #include "gimplify.h"
++#include "attribs.h"
+ 
+ /* Cached node/edge growths.  */
+ fast_call_summary<edge_growth_cache_entry *, va_heap> *edge_growth_cache = NULL;
+@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time)
+       hints = estimates.hints;
+     }
+ 
+-  /* When we have profile feedback, we can quite safely identify hot
+-     edges and for those we disable size limits.  Don't do that when
+-     probability that caller will call the callee is low however, since it
++  /* When we have profile feedback or function attribute, we can quite safely
++     identify hot edges and for those we disable size limits.  Don't do that
++     when probability that caller will call the callee is low however, since it
+      may hurt optimization of the caller's hot path.  */
+-  if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
++  if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
+       && (edge->count.ipa ().apply_scale (2, 1)
+ 	  > (edge->caller->inlined_to
+ 	     ? edge->caller->inlined_to->count.ipa ()
+ 	     : edge->caller->count.ipa ())))
++      || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl))
++	  != NULL
++	 && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl))
++	  != NULL))
+     hints |= INLINE_HINT_known_hot;
+ 
+   gcc_checking_assert (size >= 0);
+diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+new file mode 100644
+index 00000000000..1f3be641c6d
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+@@ -0,0 +1,47 @@
++/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp"  } */
++/* { dg-add-options bind_pic_locally } */
++
++#define size_t long long int
++
++struct A
++{
++  size_t f1, f2, f3, f4;
++};
++struct C
++{
++  struct A a;
++  size_t b;
++};
++struct C x;
++
++__attribute__((hot)) struct C callee (struct A *a, struct C *c)
++{
++  c->a=(*a);
++
++  if((c->b + 7) & 17)
++   {
++      c->a.f1 = c->a.f2 + c->a.f1;
++      c->a.f2 = c->a.f3 - c->a.f2;
++      c->a.f3 = c->a.f2 + c->a.f3;
++      c->a.f4 = c->a.f2 - c->a.f4;
++      c->b = c->a.f2;
++
++    }
++  return *c;
++}
++
++__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c)
++{
++  struct A a;
++  a.f1 = 1 + d;
++  a.f2 = e;
++  a.f3 = 12 + f;
++  a.f4 = 68 + g;
++  if (c->b > 0)
++    return callee (&a, c);
++  else
++    return *c;
++}
++
++/* { dg-final { scan-ipa-dump "known_hot"  "inline"  } } */
++
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 562650f..15b721f 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -391,6 +391,7 @@ Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: gcc12-testsuite-typo.patch
 Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
+Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -802,6 +803,7 @@ cd ..
 %endif
 %patch3020 -p1 -b .typo
 %patch5000 -p1 -b .dts-test-20~
+%patch5001 -p1 -b .dts-test-22~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -2998,6 +3000,7 @@ fi
 %changelog
 * Wed Jul 17 2024 Haochen Jiang <haochen.jiang@intel.com> 12.2.1-7.6.0.1
 - i386: Add syscall to enable AMX for latest kernels
+- Add attribute hot judgement for INLINE_HINT_known_hot hint
 
 * Wed Apr  3 2024 Marek Polacek <polacek@redhat.com> 12.2.1-7.6
 - bump NVR (RHEL-31253)
-- 
Gitee


From f800c4e779e38e12a039c07f6fe0e3170e02eec7 Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Tue, 29 Nov 2022 14:18:12 +0800
Subject: [PATCH 4/6] Enable small loop unrolling for O2

Modern processors have multi-way instruction decoders.
For x86, icelake/zen3 has 5 uops, so for a small loop with <= 4
instructions (usually 3 uops, with a cmp/jmp pair that can be
macro-fused), the decoder would have a 2-uop bubble in each iteration
and the pipeline could not be fully utilized.

Therefore, this patch enables loop unrolling for small loops at O2
to fill the decoder as much as possible. It turns on rtl loop
unrolling only when targetm.loop_unroll_adjust exists, at O2 and above,
and when optimizing for speed. In the x86 backend the default behavior is
to unroll small loops with at most 4 insns once (unroll factor 2).

This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with a
0.9% code size increase. For other benchmarks the variations are minor
and overall code size increased by 0.2%.

The kernel image size increased by 0.06%, and there is no impact on eembc.

gcc/ChangeLog:

	* common/config/i386/i386-common.cc (ix86_optimization_table):
	Enable small loop unroll at O2 by default.
	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
	factor if -munroll-only-small-loops enabled and -funroll-loops/
	-funroll-all-loops are disabled.
	* config/i386/i386.h (struct processor_costs): Add 2 field
	small_unroll_ninsns and small_unroll_factor.
	* config/i386/i386.opt: Add -munroll-only-small-loops.
	* doc/invoke.texi: Document -munroll-only-small-loops.
	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
	loop unrolling for -O2-speed and above if target hook
	loop_unroll_adjust exists.
	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
	when target hook loop_unroll_adjust exists.
	* config/i386/x86-tune-costs.h: Update all processor costs
	with small_unroll_ninsns = 4 and small_unroll_factor = 2.

gcc/testsuite/ChangeLog:

	* gcc.dg/guality/loop-1.c: Add additional option
	-mno-unroll-only-small-loops.
	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
	* gcc.target/i386/pr93002.c: Likewise.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=071e428c24ee8c1ed062597a093708bba29509c9
---
 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ++++++++++++++++++
 gcc.spec                                      |   3 +
 2 files changed, 484 insertions(+)
 create mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch

diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch
new file mode 100644
index 0000000..b16171b
--- /dev/null
+++ b/0026-Enable-small-loop-unrolling-for-O2.patch
@@ -0,0 +1,481 @@
+From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Thu, 8 Sep 2022 16:52:02 +0800
+Subject: [PATCH 4/5] Enable small loop unrolling for O2
+
+Modern processors has multiple way instruction decoders
+For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
+instructions (usually has 3 uops with a cmp/jmp pair that can be
+macro-fused), the decoder would have 2 uops bubble for each iteration
+and the pipeline could not be fully utilized.
+
+Therefore, this patch enables loop unrolling for small size loop at O2
+to fullfill the decoder as much as possible. It turns on rtl loop
+unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
+In x86 backend the default behavior is to unroll small loops with less
+than 4 insns by 1 time.
+
+This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
+0.9% codesize increment. For other benchmarks the variants are minor
+and overall codesize increased by 0.2%.
+
+The kernel image size increased by 0.06%, and no impact on eembc.
+
+gcc/ChangeLog:
+
+	* common/config/i386/i386-common.cc (ix86_optimization_table):
+	Enable small loop unroll at O2 by default.
+	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
+	factor if -munroll-only-small-loops enabled and -funroll-loops/
+	-funroll-all-loops are disabled.
+	* config/i386/i386.h (struct processor_costs): Add 2 field
+	small_unroll_ninsns and small_unroll_factor.
+	* config/i386/i386.opt: Add -munroll-only-small-loops.
+	* doc/invoke.texi: Document -munroll-only-small-loops.
+	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
+	loop unrolling for -O2-speed and above if target hook
+	loop_unroll_adjust exists.
+	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
+	when target hook loop_unroll_adjust exists.
+	* config/i386/x86-tune-costs.h: Update all processor costs
+	with small_unroll_ninsns = 4 and small_unroll_factor = 2.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.dg/guality/loop-1.c: Add additional option
+	-mno-unroll-only-small-loops.
+	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
+	* gcc.target/i386/pr93002.c: Likewise.
+---
+ gcc/common/config/i386/i386-common.cc   |  1 +
+ gcc/config/i386/i386.cc                 | 18 ++++++++
+ gcc/config/i386/i386.h                  |  5 +++
+ gcc/config/i386/i386.opt                |  4 ++
+ gcc/config/i386/x86-tune-costs.h        | 56 +++++++++++++++++++++++++
+ gcc/doc/invoke.texi                     | 11 ++++-
+ gcc/loop-init.cc                        | 10 +++--
+ gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 +
+ gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
+ gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
+ 10 files changed, 105 insertions(+), 6 deletions(-)
+
+diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
+index 07fdd045f30..e1c1fb07d8a 100644
+--- a/gcc/common/config/i386/i386-common.cc
++++ b/gcc/common/config/i386/i386-common.cc
+@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
+     /* The STC algorithm produces the smallest code at -Os, for x86.  */
+     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
+       REORDER_BLOCKS_ALGORITHM_STC },
++    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
+     /* Turn off -fschedule-insns by default.  It tends to make the
+        problem with not enough registers even worse.  */
+     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index b16df5b183e..39b2468799c 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
+   unsigned i;
+   unsigned mem_count = 0;
+ 
++  /* Unroll small size loop when unroll factor is not explicitly
++     specified.  */
++  if (!(flag_unroll_loops
++	|| flag_unroll_all_loops
++	|| loop->unroll))
++    {
++      nunroll = 1;
++
++      /* Any explicit -f{no-}unroll-{all-}loops turns off
++	 -munroll-only-small-loops.  */
++      if (ix86_unroll_only_small_loops
++	  && !OPTION_SET_P (flag_unroll_loops)
++	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
++	nunroll = ix86_cost->small_unroll_factor;
++
++      return nunroll;
++    }
++
+   if (!TARGET_ADJUST_UNROLL)
+      return nunroll;
+ 
+diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
+index a61c32b8957..421801111a7 100644
+--- a/gcc/config/i386/i386.h
++++ b/gcc/config/i386/i386.h
+@@ -219,6 +219,11 @@ struct processor_costs {
+   const char *const align_jump;		/* Jump alignment.  */
+   const char *const align_label;	/* Label alignment.  */
+   const char *const align_func;		/* Function alignment.  */
++
++  const unsigned small_unroll_ninsns;	/* Insn count limit for small loop
++					   to be unrolled.  */
++  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
++					   be unrolled.  */
+ };
+ 
+ extern const struct processor_costs *ix86_cost;
+diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
+index a6b0e28f238..3d369647bf7 100644
+--- a/gcc/config/i386/i386.opt
++++ b/gcc/config/i386/i386.opt
+@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
+ -param=x86-stlf-window-ninsns=
+ Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
+ Instructions number above which STFL stall penalty can be compensated.
++
++munroll-only-small-loops
++Target Var(ix86_unroll_only_small_loops) Init(0) Save
++Enable conservative small loop unrolling.
+diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
+index 017ffa69958..b4303e4e971 100644
+--- a/gcc/config/i386/x86-tune-costs.h
++++ b/gcc/config/i386/x86-tune-costs.h
+@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* Processor costs (relative to an add) */
+@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
+   "4",					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   "4",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs i486_memcpy[2] = {
+@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs pentium_memcpy[2] = {
+@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static const
+@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
+@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs geode_memcpy[2] = {
+@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs k6_memcpy[2] = {
+@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
+   "32:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "32",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* For some reason, Athlon deals better with REP prefix (relative to loops)
+@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* K8 has optimized REP instruction for medium sized blocks, but for very
+@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
+@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
+   "32:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "32",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /*  BDVER has optimized REP instruction for medium sized blocks, but for
+@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ 
+@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
+@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ struct processor_costs znver3_cost = {
+@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
+@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* icelake_cost should produce code tuned for Icelake family of CPUs.
+@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
+@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
+@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs btver2_memcpy[2] = {
+@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs pentium4_memcpy[2] = {
+@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs nocona_memcpy[2] = {
+@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs atom_memcpy[2] = {
+@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs slm_memcpy[2] = {
+@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs tremont_memcpy[2] = {
+@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs intel_memcpy[2] = {
+@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* Generic should produce code tuned for Core-i7 (and newer chips)
+@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+ /* core_cost should produce code tuned for Core familly of CPUs.  */
+@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
++  4,					/* Small unroll limit.  */
++  2,					/* Small unroll factor.  */
+ };
+ 
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 9ac7f89ebb1..1961cafa2bb 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options.
+ -mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
+ -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
+ -mindirect-branch-register -mharden-sls=@var{choice} @gol
+--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
++-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
++-munroll-only-small-loops}
+ 
+ @emph{x86 Windows Options}
+ @gccoptlist{-mconsole  -mcygwin  -mno-cygwin  -mdll @gol
+@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols.  The default is
+ @option{-mno-direct-extern-access} and executable compiled with
+ @option{-mdirect-extern-access} may not be binary compatible if
+ protected symbols are used in shared libraries and executable.
++
++@item -munroll-only-small-loops
++@opindex munroll-only-small-loops
++@opindex mno-unroll-only-small-loops
++Controls conservative small loop unrolling. It is default enabled by
++O2, and unrolls loop with less than 4 insns by 1 time. Explicit
++-f[no-]unroll-[all-]loops would disable this flag to avoid any
++unintended unrolling behavior that user does not want.
+ @end table
+ 
+ @node x86 Windows Options
+diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
+index 1e4f6cfd7fb..84336865ef7 100644
+--- a/gcc/loop-init.cc
++++ b/gcc/loop-init.cc
+@@ -565,9 +565,12 @@ public:
+   {}
+ 
+   /* opt_pass methods: */
+-  virtual bool gate (function *)
++  virtual bool gate (function * fun)
+     {
+-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
++      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
++	      || (targetm.loop_unroll_adjust
++		  && optimize >= 2
++		  && optimize_function_for_speed_p (fun)));
+     }
+ 
+   virtual unsigned int execute (function *);
+@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
+       if (dump_file)
+ 	df_dump (dump_file);
+ 
+-      if (flag_unroll_loops)
++      if (flag_unroll_loops
++	  || targetm.loop_unroll_adjust)
+ 	flags |= UAP_UNROLL;
+       if (flag_unroll_all_loops)
+ 	flags |= UAP_UNROLL_ALL;
+diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
+index 1b1f6d32271..a32ea445a3f 100644
+--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
+@@ -1,5 +1,7 @@
+ /* { dg-do run } */
+ /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
++/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
++
+ 
+ #include "../nop.h"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
+index 81841ef5bd7..cbc9fbb0450 100644
+--- a/gcc/testsuite/gcc.target/i386/pr86270.c
++++ b/gcc/testsuite/gcc.target/i386/pr86270.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+ 
+ int *a;
+ long len;
+diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
+index 0248fcc00a5..f75a847f75d 100644
+--- a/gcc/testsuite/gcc.target/i386/pr93002.c
++++ b/gcc/testsuite/gcc.target/i386/pr93002.c
+@@ -1,6 +1,6 @@
+ /* PR target/93002 */
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+ /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
+ 
+ volatile int sink;
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 15b721f..1771eef 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -392,6 +392,7 @@ Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: gcc12-testsuite-typo.patch
 Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
+Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -804,6 +805,7 @@ cd ..
 %patch3020 -p1 -b .typo
 %patch5000 -p1 -b .dts-test-20~
 %patch5001 -p1 -b .dts-test-22~
+%patch5002 -p1 -b .dts-test-23~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -3001,6 +3003,7 @@ fi
 * Wed Jul 17 2024 Haochen Jiang <haochen.jiang@intel.com> 12.2.1-7.6.0.1
 - i386: Add syscall to enable AMX for latest kernels
 - Add attribute hot judgement for INLINE_HINT_known_hot hint
+- Enable small loop unrolling for O2
 
 * Wed Apr  3 2024 Marek Polacek <polacek@redhat.com> 12.2.1-7.6
 - bump NVR (RHEL-31253)
-- 
Gitee


From 08b519110d055e6bd36f7647e30bf86d8d65cef8 Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Tue, 29 Nov 2022 14:22:58 +0800
Subject: [PATCH 5/6] i386: Only enable small loop unrolling in backend [PR
 107692]

Following the discussion in PR 107692, -munroll-only-small-loops
does not turn -funroll-loops on or off, and the current check in
pass_rtl_unroll_loops::gate causes -fno-unroll-loops to not take
effect. Revert the change about targetm.loop_unroll_adjust and apply
the backend option change to strictly follow the rule that
-funroll-loops takes full control of loop unrolling, and
-munroll-only-small-loops just changes its behavior to unroll small
size loops.

gcc/ChangeLog:

	PR target/107692
	* common/config/i386/i386-common.cc (ix86_optimization_table):
	Enable loop unroll O2, disable -fweb and -frename-registers
	by default.
	* config/i386/i386-options.cc
	(ix86_override_options_after_change):
	Disable small loop unroll when funroll-loops enabled, reset
	cunroll_grow_size when it is not explicitly enabled.
	(ix86_option_override_internal): Call
	ix86_override_options_after_change instead of calling
	ix86_recompute_optlev_based_flags and ix86_default_align
	separately.
	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
	factor if -munroll-only-small-loops enabled.
	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
	loop unrolling for -O2-speed.
	(pass_rtl_unroll_loops::execute): Remove
	targetm.loop_unroll_adjust check.

gcc/testsuite/ChangeLog:

	PR target/107692
	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
	* gcc.target/i386/pr93002.c: Likewise.

url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8caf155a3d6e23e47bf55068ad23c23d4655a054
---
 ...-small-loop-unrolling-in-backend-PR-.patch | 231 ++++++++++++++++++
 gcc.spec                                      |   3 +
 2 files changed, 234 insertions(+)
 create mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch

diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
new file mode 100644
index 0000000..de3995f
--- /dev/null
+++ b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
@@ -0,0 +1,231 @@
+From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Sat, 19 Nov 2022 09:38:00 +0800
+Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR
+ 107692]
+
+Followed by the discussion in pr107692, -munroll-only-small-loops
+Does not turns on/off -funroll-loops, and current check in
+pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take
+effect. Revert the change about targetm.loop_unroll_adjust and apply
+the backend option change to strictly follow the rule that
+-funroll-loops takes full control of loop unrolling, and
+munroll-only-small-loops just change its behavior to unroll small size
+loops.
+
+gcc/ChangeLog:
+
+	PR target/107692
+	* common/config/i386/i386-common.cc (ix86_optimization_table):
+	Enable loop unroll O2, disable -fweb and -frename-registers
+	by default.
+	* config/i386/i386-options.cc
+	(ix86_override_options_after_change):
+	Disable small loop unroll when funroll-loops enabled, reset
+	cunroll_grow_size when it is not explicitly enabled.
+	(ix86_option_override_internal): Call
+	ix86_override_options_after_change instead of calling
+	ix86_recompute_optlev_based_flags and ix86_default_align
+	separately.
+	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
+	factor if -munroll-only-small-loops enabled.
+	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
+	loop unrolling for -O2-speed.
+	(pass_rtl_unroll_loops::execute): Rmove
+	targetm.loop_unroll_adjust check.
+
+gcc/testsuite/ChangeLog:
+
+	PR target/107692
+	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
+	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
+	* gcc.target/i386/pr93002.c: Likewise.
+---
+ gcc/common/config/i386/i386-common.cc   |  8 ++++++
+ gcc/config/i386/i386-options.cc         | 34 ++++++++++++++++++++++---
+ gcc/config/i386/i386.cc                 | 18 ++++---------
+ gcc/loop-init.cc                        | 11 +++-----
+ gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 --
+ gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
+ gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
+ 7 files changed, 49 insertions(+), 28 deletions(-)
+
+diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
+index e1c1fb07d8a..5e777849f91 100644
+--- a/gcc/common/config/i386/i386-common.cc
++++ b/gcc/common/config/i386/i386-common.cc
+@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] =
+     /* The STC algorithm produces the smallest code at -Os, for x86.  */
+     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
+       REORDER_BLOCKS_ALGORITHM_STC },
++
++    /* Turn on -funroll-loops with -munroll-only-small-loops to enable small
++       loop unrolling at -O2.  */
++    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
+     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
++    /* Turns off -frename-registers and -fweb which are enabled by
++       funroll-loops.  */
++    { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
++    { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
+     /* Turn off -fschedule-insns by default.  It tends to make the
+        problem with not enough registers even worse.  */
+     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index 32cc58a764b..b853ff55825 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts,
+ void
+ ix86_override_options_after_change (void)
+ {
++  /* Default align_* from the processor table.  */
+   ix86_default_align (&global_options);
++
+   ix86_recompute_optlev_based_flags (&global_options, &global_options_set);
++
++  /* Disable unrolling small loops when there's explicit
++     -f{,no}unroll-loop.  */
++  if ((OPTION_SET_P (flag_unroll_loops))
++     || (OPTION_SET_P (flag_unroll_all_loops)
++	 && flag_unroll_all_loops))
++    {
++      if (!OPTION_SET_P (ix86_unroll_only_small_loops))
++	ix86_unroll_only_small_loops = 0;
++      /* Re-enable -frename-registers and -fweb if funroll-loops
++	 enabled.  */
++      if (!OPTION_SET_P (flag_web))
++	flag_web = flag_unroll_loops;
++      if (!OPTION_SET_P (flag_rename_registers))
++	flag_rename_registers = flag_unroll_loops;
++      /* -fcunroll-grow-size default follws -f[no]-unroll-loops.  */
++      if (!OPTION_SET_P (flag_cunroll_grow_size))
++	flag_cunroll_grow_size = flag_unroll_loops
++				 || flag_peel_loops
++				 || optimize >= 3;
++    }
++  else
++    {
++      if (!OPTION_SET_P (flag_cunroll_grow_size))
++	flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
++    }
++
+ }
+ 
+ /* Clear stack slot assignments remembered from previous functions.
+@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p,
+ 
+   set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes);
+ 
+-  ix86_recompute_optlev_based_flags (opts, opts_set);
++  ix86_override_options_after_change ();
+ 
+   ix86_tune_cost = processor_cost_table[ix86_tune];
+   /* TODO: ix86_cost should be chosen at instruction or function granuality
+@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p,
+       || TARGET_64BIT_P (opts->x_ix86_isa_flags))
+     opts->x_ix86_regparm = REGPARM_MAX;
+ 
+-  /* Default align_* from the processor table.  */
+-  ix86_default_align (opts);
+-
+   /* Provide default for -mbranch-cost= value.  */
+   SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost,
+ 		       ix86_tune_cost->branch_cost);
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 39b2468799c..000415c0e2e 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
+ 
+   /* Unroll small size loop when unroll factor is not explicitly
+      specified.  */
+-  if (!(flag_unroll_loops
+-	|| flag_unroll_all_loops
+-	|| loop->unroll))
++  if (ix86_unroll_only_small_loops && !loop->unroll)
+     {
+-      nunroll = 1;
+-
+-      /* Any explicit -f{no-}unroll-{all-}loops turns off
+-	 -munroll-only-small-loops.  */
+-      if (ix86_unroll_only_small_loops
+-	  && !OPTION_SET_P (flag_unroll_loops)
+-	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+-	nunroll = ix86_cost->small_unroll_factor;
+-
+-      return nunroll;
++      if (loop->ninsns <= ix86_cost->small_unroll_ninsns)
++	return MIN (nunroll, ix86_cost->small_unroll_factor);
++      else
++	return 1;
+     }
+ 
+   if (!TARGET_ADJUST_UNROLL)
+diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
+index 84336865ef7..ed1b2f6ebab 100644
+--- a/gcc/loop-init.cc
++++ b/gcc/loop-init.cc
+@@ -565,12 +565,10 @@ public:
+   {}
+ 
+   /* opt_pass methods: */
+-  virtual bool gate (function * fun)
++  virtual bool gate (function *)
+     {
+-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+-	      || (targetm.loop_unroll_adjust
+-		  && optimize >= 2
+-		  && optimize_function_for_speed_p (fun)));
++      return (flag_unroll_loops || flag_unroll_all_loops
++	      || cfun->has_unroll);
+     }
+ 
+   virtual unsigned int execute (function *);
+@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun)
+       if (dump_file)
+ 	df_dump (dump_file);
+ 
+-      if (flag_unroll_loops
+-	  || targetm.loop_unroll_adjust)
++      if (flag_unroll_loops)
+ 	flags |= UAP_UNROLL;
+       if (flag_unroll_all_loops)
+ 	flags |= UAP_UNROLL_ALL;
+diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
+index a32ea445a3f..1b1f6d32271 100644
+--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
+@@ -1,7 +1,5 @@
+ /* { dg-do run } */
+ /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+-/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+-
+ 
+ #include "../nop.h"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
+index cbc9fbb0450..98b012caf23 100644
+--- a/gcc/testsuite/gcc.target/i386/pr86270.c
++++ b/gcc/testsuite/gcc.target/i386/pr86270.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
++/* { dg-options "-O2 -fno-unroll-loops" } */
+ 
+ int *a;
+ long len;
+diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
+index f75a847f75d..7e2d869e17b 100644
+--- a/gcc/testsuite/gcc.target/i386/pr93002.c
++++ b/gcc/testsuite/gcc.target/i386/pr93002.c
+@@ -1,6 +1,6 @@
+ /* PR target/93002 */
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
++/* { dg-options "-O2 -fno-unroll-loops" } */
+ /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
+ 
+ volatile int sink;
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 1771eef..05d7e9b 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -393,6 +393,7 @@ Patch3020: gcc12-testsuite-typo.patch
 Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch
+Patch5003: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
 
 %if 0%{?rhel} == 9
 %global nonsharedver 110
@@ -806,6 +807,7 @@ cd ..
 %patch5000 -p1 -b .dts-test-20~
 %patch5001 -p1 -b .dts-test-22~
 %patch5002 -p1 -b .dts-test-23~
+%patch5003 -p1 -b .dts-test-24~
 
 find gcc/testsuite -name \*.pr96939~ | xargs rm -f
 
@@ -3004,6 +3006,7 @@ fi
 - i386: Add syscall to enable AMX for latest kernels
 - Add attribute hot judgement for INLINE_HINT_known_hot hint
 - Enable small loop unrolling for O2
+- i386: Only enable small loop unrolling in backend [PR 107692]
 
 * Wed Apr  3 2024 Marek Polacek <polacek@redhat.com> 12.2.1-7.6
 - bump NVR (RHEL-31253)
-- 
Gitee


From 04341889f8d21a8b61b79370af48be876972567d Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Wed, 14 Dec 2022 11:12:47 +0800
Subject: [PATCH 6/6] i386: Avoid fma_chain for -march=alderlake and
 sapphirerapids.

Alderlake has an issue similar to PR 81616; enabling
avoid_fma256_chains also benefits Intel's latest platforms,
Alderlake and Sapphire Rapids.

gcc/ChangeLog:

	* config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add
	m_SAPPHIRERAPIDS, m_ALDERLAKE.
---
 ...hain-for-march-alderlake-and-sapphir.patch | 35 +++++++++++++++++++
 gcc.spec                                      |  3 --
 2 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch

diff --git a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch
new file mode 100644
index 0000000..ad65965
--- /dev/null
+++ b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch
@@ -0,0 +1,35 @@
+From b7980cd8d8bcf41b3ca1b6f3ba147789d42a9b99 Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Tue, 6 Dec 2022 09:53:35 +0800
+Subject: [PATCH] i386: Avoid fma_chain for -march=alderlake and
+ sapphirerapids.
+
+For Alderlake there is similar issue like PR 81616, enable
+avoid_fma256_chain will also benefit on Intel latest platforms
+Alderlake and Sapphire Rapids.
+
+gcc/ChangeLog:
+
+	* config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add
+	m_SAPPHIRERAPIDS, m_ALDERLAKE.
+---
+ gcc/config/i386/x86-tune.def | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index d983e2f6213..1e1b206a71c 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -485,7 +485,8 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
+ 
+ /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
+    smaller FMA chain.  */
+-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3)
++DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
++	  | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+ 
+ /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
+    for v2df vector reduction.  */
+-- 
+2.18.2
+
diff --git a/gcc.spec b/gcc.spec
index 05d7e9b..7ad9ac7 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -390,7 +390,6 @@ Patch3017: 0020-more-fixes.patch
 Patch3018: 0021-libstdc++-disable-tests.patch
 Patch3019: 0022-libstdc++-revert-behavior.patch
 Patch3020: gcc12-testsuite-typo.patch
-Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch
 Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
 Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch
 Patch5003: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
@@ -804,7 +803,6 @@ cd ..
 %patch3019 -p1 -b .dts-test-19~
 %endif
 %patch3020 -p1 -b .typo
-%patch5000 -p1 -b .dts-test-20~
 %patch5001 -p1 -b .dts-test-22~
 %patch5002 -p1 -b .dts-test-23~
 %patch5003 -p1 -b .dts-test-24~
@@ -3003,7 +3001,6 @@ fi
 
 %changelog
 * Wed Jul 17 2024 Haochen Jiang <haochen.jiang@intel.com> 12.2.1-7.6.0.1
-- i386: Add syscall to enable AMX for latest kernels
 - Add attribute hot judgement for INLINE_HINT_known_hot hint
 - Enable small loop unrolling for O2
 - i386: Only enable small loop unrolling in backend [PR 107692]
-- 
Gitee