From b7516c66adbc469e7b7e5b9564467630d917e7fd Mon Sep 17 00:00:00 2001 From: Renbo Date: Wed, 17 Jul 2024 16:37:46 +0800 Subject: [PATCH 1/6] update to gcc-toolset-12-gcc-12.2.1-7.6.src.rpm Signed-off-by: Renbo --- ...all-to-enable-AMX-for-latest-kernels.patch | 77 --- ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 ----- 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ------------------ ...-small-loop-unrolling-in-backend-PR-.patch | 231 --------- ...hain-for-march-alderlake-and-sapphir.patch | 35 -- dist | 2 +- gcc.spec | 26 +- gcc12-pr113960.patch | 107 ++++ 8 files changed, 120 insertions(+), 962 deletions(-) delete mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch delete mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch delete mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch delete mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch delete mode 100644 0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch create mode 100644 gcc12-pr113960.patch diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch deleted file mode 100644 index 94625b5..0000000 --- a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001 -From: Haochen Jiang -Date: Thu, 16 Jun 2022 00:15:53 -0700 -Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/amx-check.h (request_perm_xtile_data): - New function to check if AMX is usable and enable AMX. - (main): Run test if AMX is usable. 
---- - gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++ - 1 file changed, 30 insertions(+) - -diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h -index 434b0e59703..6fff5ff4631 100644 ---- a/gcc/testsuite/gcc.target/i386/amx-check.h -+++ b/gcc/testsuite/gcc.target/i386/amx-check.h -@@ -4,11 +4,24 @@ - #include - #include - #include -+#include -+#ifdef __linux__ -+#include -+#endif - #ifdef DEBUG - #include - #endif - #include "cpuid.h" - -+#define XFEATURE_XTILECFG 17 -+#define XFEATURE_XTILEDATA 18 -+#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) -+#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) -+#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) -+ -+#define ARCH_GET_XCOMP_PERM 0x1022 -+#define ARCH_REQ_XCOMP_PERM 0x1023 -+ - /* TODO: The tmm emulation is temporary for current - AMX implementation with no tmm regclass, should - be changed in the future. */ -@@ -44,6 +57,20 @@ typedef struct __tile - /* Stride (colum width in byte) used for tileload/store */ - #define _STRIDE 64 - -+#ifdef __linux__ -+/* We need syscall to use amx functions */ -+int request_perm_xtile_data() -+{ -+ unsigned long bitmask; -+ -+ if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) || -+ syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask)) -+ return 0; -+ -+ return (bitmask & XFEATURE_MASK_XTILE) != 0; -+} -+#endif -+ - /* Initialize tile config by setting all tmm size to 16x64 */ - void init_tile_config (__tilecfg_u *dst) - { -@@ -185,6 +212,9 @@ main () - #endif - #ifdef AMX_BF16 - && __builtin_cpu_supports ("amx-bf16") -+#endif -+#ifdef __linux__ -+ && request_perm_xtile_data () - #endif - ) - { --- -2.18.2 - diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch deleted file mode 100644 index 3e70f0c..0000000 --- 
a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001 -From: "Cui,Lili" -Date: Tue, 1 Nov 2022 09:16:49 +0800 -Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint. - -We set up INLINE_HINT_known_hot hint only when we have profile feedback, -now add function attribute judgement for it, when both caller and callee -have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint -for it. - -With this patch applied, -ADL Multi-copy: 538.imagic_r 16.7% -ICX Multi-copy: 538.imagic_r 15.2% -CLX Multi-copy: 538.imagic_r 12.7% -Znver3 Multi-copy: 538.imagic_r 10.6% -Arm Multi-copy: 538.imagic_r 13.4% - -gcc/ChangeLog - - * ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute - judgement for INLINE_HINT_known_hot hint. - -gcc/testsuite/ChangeLog: - - * gcc.dg/ipa/inlinehint-6.c: New test. ---- - gcc/ipa-inline-analysis.cc | 13 ++++--- - gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++ - 2 files changed, 56 insertions(+), 4 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c - -diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc -index 1ca685d1b0e..7bd29c36590 100644 ---- a/gcc/ipa-inline-analysis.cc -+++ b/gcc/ipa-inline-analysis.cc -@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see - #include "ipa-utils.h" - #include "cfgexpand.h" - #include "gimplify.h" -+#include "attribs.h" - - /* Cached node/edge growths. */ - fast_call_summary *edge_growth_cache = NULL; -@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time) - hints = estimates.hints; - } - -- /* When we have profile feedback, we can quite safely identify hot -- edges and for those we disable size limits. 
Don't do that when -- probability that caller will call the callee is low however, since it -+ /* When we have profile feedback or function attribute, we can quite safely -+ identify hot edges and for those we disable size limits. Don't do that -+ when probability that caller will call the callee is low however, since it - may hurt optimization of the caller's hot path. */ -- if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p () -+ if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p () - && (edge->count.ipa ().apply_scale (2, 1) - > (edge->caller->inlined_to - ? edge->caller->inlined_to->count.ipa () - : edge->caller->count.ipa ()))) -+ || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl)) -+ != NULL -+ && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl)) -+ != NULL)) - hints |= INLINE_HINT_known_hot; - - gcc_checking_assert (size >= 0); -diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c -new file mode 100644 -index 00000000000..1f3be641c6d ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c -@@ -0,0 +1,47 @@ -+/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp" } */ -+/* { dg-add-options bind_pic_locally } */ -+ -+#define size_t long long int -+ -+struct A -+{ -+ size_t f1, f2, f3, f4; -+}; -+struct C -+{ -+ struct A a; -+ size_t b; -+}; -+struct C x; -+ -+__attribute__((hot)) struct C callee (struct A *a, struct C *c) -+{ -+ c->a=(*a); -+ -+ if((c->b + 7) & 17) -+ { -+ c->a.f1 = c->a.f2 + c->a.f1; -+ c->a.f2 = c->a.f3 - c->a.f2; -+ c->a.f3 = c->a.f2 + c->a.f3; -+ c->a.f4 = c->a.f2 - c->a.f4; -+ c->b = c->a.f2; -+ -+ } -+ return *c; -+} -+ -+__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c) -+{ -+ struct A a; -+ a.f1 = 1 + d; -+ a.f2 = e; -+ a.f3 = 12 + f; -+ a.f4 = 68 + g; -+ if (c->b > 0) -+ return callee (&a, c); -+ else -+ return *c; -+} -+ -+/* { dg-final { scan-ipa-dump "known_hot" 
"inline" } } */ -+ --- -2.18.2 - diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch deleted file mode 100644 index b16171b..0000000 --- a/0026-Enable-small-loop-unrolling-for-O2.patch +++ /dev/null @@ -1,481 +0,0 @@ -From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang -Date: Thu, 8 Sep 2022 16:52:02 +0800 -Subject: [PATCH 4/5] Enable small loop unrolling for O2 - -Modern processors has multiple way instruction decoders -For x86, icelake/zen3 has 5 uops, so for small loop with <= 4 -instructions (usually has 3 uops with a cmp/jmp pair that can be -macro-fused), the decoder would have 2 uops bubble for each iteration -and the pipeline could not be fully utilized. - -Therefore, this patch enables loop unrolling for small size loop at O2 -to fullfill the decoder as much as possible. It turns on rtl loop -unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only. -In x86 backend the default behavior is to unroll small loops with less -than 4 insns by 1 time. - -This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with -0.9% codesize increment. For other benchmarks the variants are minor -and overall codesize increased by 0.2%. - -The kernel image size increased by 0.06%, and no impact on eembc. - -gcc/ChangeLog: - - * common/config/i386/i386-common.cc (ix86_optimization_table): - Enable small loop unroll at O2 by default. - * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll - factor if -munroll-only-small-loops enabled and -funroll-loops/ - -funroll-all-loops are disabled. - * config/i386/i386.h (struct processor_costs): Add 2 field - small_unroll_ninsns and small_unroll_factor. - * config/i386/i386.opt: Add -munroll-only-small-loops. - * doc/invoke.texi: Document -munroll-only-small-loops. - * loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl - loop unrolling for -O2-speed and above if target hook - loop_unroll_adjust exists. 
- (pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag - when target hook loop_unroll_adjust exists. - * config/i386/x86-tune-costs.h: Update all processor costs - with small_unroll_ninsns = 4 and small_unroll_factor = 2. - -gcc/testsuite/ChangeLog: - - * gcc.dg/guality/loop-1.c: Add additional option - -mno-unroll-only-small-loops. - * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops. - * gcc.target/i386/pr93002.c: Likewise. ---- - gcc/common/config/i386/i386-common.cc | 1 + - gcc/config/i386/i386.cc | 18 ++++++++ - gcc/config/i386/i386.h | 5 +++ - gcc/config/i386/i386.opt | 4 ++ - gcc/config/i386/x86-tune-costs.h | 56 +++++++++++++++++++++++++ - gcc/doc/invoke.texi | 11 ++++- - gcc/loop-init.cc | 10 +++-- - gcc/testsuite/gcc.dg/guality/loop-1.c | 2 + - gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- - gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- - 10 files changed, 105 insertions(+), 6 deletions(-) - -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index 07fdd045f30..e1c1fb07d8a 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] = - /* The STC algorithm produces the smallest code at -Os, for x86. */ - { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, - REORDER_BLOCKS_ALGORITHM_STC }, -+ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, - /* Turn off -fschedule-insns by default. It tends to make the - problem with not enough registers even worse. 
*/ - { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index b16df5b183e..39b2468799c 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) - unsigned i; - unsigned mem_count = 0; - -+ /* Unroll small size loop when unroll factor is not explicitly -+ specified. */ -+ if (!(flag_unroll_loops -+ || flag_unroll_all_loops -+ || loop->unroll)) -+ { -+ nunroll = 1; -+ -+ /* Any explicit -f{no-}unroll-{all-}loops turns off -+ -munroll-only-small-loops. */ -+ if (ix86_unroll_only_small_loops -+ && !OPTION_SET_P (flag_unroll_loops) -+ && loop->ninsns <= ix86_cost->small_unroll_ninsns) -+ nunroll = ix86_cost->small_unroll_factor; -+ -+ return nunroll; -+ } -+ - if (!TARGET_ADJUST_UNROLL) - return nunroll; - -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index a61c32b8957..421801111a7 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -219,6 +219,11 @@ struct processor_costs { - const char *const align_jump; /* Jump alignment. */ - const char *const align_label; /* Label alignment. */ - const char *const align_func; /* Function alignment. */ -+ -+ const unsigned small_unroll_ninsns; /* Insn count limit for small loop -+ to be unrolled. */ -+ const unsigned small_unroll_factor; /* Unroll factor for small loop to -+ be unrolled. */ - }; - - extern const struct processor_costs *ix86_cost; -diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt -index a6b0e28f238..3d369647bf7 100644 ---- a/gcc/config/i386/i386.opt -+++ b/gcc/config/i386/i386.opt -@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols. - -param=x86-stlf-window-ninsns= - Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param - Instructions number above which STFL stall penalty can be compensated. 
-+ -+munroll-only-small-loops -+Target Var(ix86_unroll_only_small_loops) Init(0) Save -+Enable conservative small loop unrolling. -diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h -index 017ffa69958..b4303e4e971 100644 ---- a/gcc/config/i386/x86-tune-costs.h -+++ b/gcc/config/i386/x86-tune-costs.h -@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* Processor costs (relative to an add) */ -@@ -244,6 +246,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */ - "4", /* Jump alignment. */ - NULL, /* Label alignment. */ - "4", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs i486_memcpy[2] = { -@@ -354,6 +358,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */ - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs pentium_memcpy[2] = { -@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static const -@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes -@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. 
*/ - }; - - static stringop_algs geode_memcpy[2] = { -@@ -786,6 +798,8 @@ struct processor_costs geode_cost = { - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs k6_memcpy[2] = { -@@ -896,6 +910,8 @@ struct processor_costs k6_cost = { - "32:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "32", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* For some reason, Athlon deals better with REP prefix (relative to loops) -@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* K8 has optimized REP instruction for medium sized blocks, but for very -@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for -@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = { - "32:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "32", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* BDVER has optimized REP instruction for medium sized blocks, but for -@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "11", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - -@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = { - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. 
*/ - }; - - /* ZNVER2 has optimized REP instruction for medium sized blocks, but for -@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = { - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - struct processor_costs znver3_cost = { -@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = { - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ -@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* icelake_cost should produce code tuned for Icelake family of CPUs. -@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* alderlake_cost should produce code tuned for alderlake family of CPUs. */ -@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* BTVER1 has optimized REP instruction for medium sized blocks, but for -@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "11", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs btver2_memcpy[2] = { -@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = { - "16:8:8", /* Jump alignment. 
*/ - "0:0:8", /* Label alignment. */ - "11", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs pentium4_memcpy[2] = { -@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = { - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs nocona_memcpy[2] = { -@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = { - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs atom_memcpy[2] = { -@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs slm_memcpy[2] = { -@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs tremont_memcpy[2] = { -@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs intel_memcpy[2] = { -@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* Generic should produce code tuned for Core-i7 (and newer chips) -@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. 
*/ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* core_cost should produce code tuned for Core familly of CPUs. */ -@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - -diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi -index 9ac7f89ebb1..1961cafa2bb 100644 ---- a/gcc/doc/invoke.texi -+++ b/gcc/doc/invoke.texi -@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options. - -mgeneral-regs-only -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol - -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol - -mindirect-branch-register -mharden-sls=@var{choice} @gol ---mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access} -+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol -+-munroll-only-small-loops} - - @emph{x86 Windows Options} - @gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol -@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols. The default is - @option{-mno-direct-extern-access} and executable compiled with - @option{-mdirect-extern-access} may not be binary compatible if - protected symbols are used in shared libraries and executable. -+ -+@item -munroll-only-small-loops -+@opindex munroll-only-small-loops -+@opindex mno-unroll-only-small-loops -+Controls conservative small loop unrolling. It is default enabled by -+O2, and unrolls loop with less than 4 insns by 1 time. Explicit -+-f[no-]unroll-[all-]loops would disable this flag to avoid any -+unintended unrolling behavior that user does not want. 
- @end table - - @node x86 Windows Options -diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc -index 1e4f6cfd7fb..84336865ef7 100644 ---- a/gcc/loop-init.cc -+++ b/gcc/loop-init.cc -@@ -565,9 +565,12 @@ public: - {} - - /* opt_pass methods: */ -- virtual bool gate (function *) -+ virtual bool gate (function * fun) - { -- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll); -+ return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll -+ || (targetm.loop_unroll_adjust -+ && optimize >= 2 -+ && optimize_function_for_speed_p (fun))); - } - - virtual unsigned int execute (function *); -@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun) - if (dump_file) - df_dump (dump_file); - -- if (flag_unroll_loops) -+ if (flag_unroll_loops -+ || targetm.loop_unroll_adjust) - flags |= UAP_UNROLL; - if (flag_unroll_all_loops) - flags |= UAP_UNROLL_ALL; -diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c -index 1b1f6d32271..a32ea445a3f 100644 ---- a/gcc/testsuite/gcc.dg/guality/loop-1.c -+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c -@@ -1,5 +1,7 @@ - /* { dg-do run } */ - /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ -+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ -+ - - #include "../nop.h" - -diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c -index 81841ef5bd7..cbc9fbb0450 100644 ---- a/gcc/testsuite/gcc.target/i386/pr86270.c -+++ b/gcc/testsuite/gcc.target/i386/pr86270.c -@@ -1,5 +1,5 @@ - /* { dg-do compile } */ --/* { dg-options "-O2" } */ -+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ - - int *a; - long len; -diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c -index 0248fcc00a5..f75a847f75d 100644 ---- a/gcc/testsuite/gcc.target/i386/pr93002.c -+++ b/gcc/testsuite/gcc.target/i386/pr93002.c -@@ -1,6 +1,6 @@ - /* PR target/93002 */ - /* { dg-do 
compile } */ --/* { dg-options "-O2" } */ -+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ - /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ - - volatile int sink; --- -2.18.2 - diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch deleted file mode 100644 index de3995f..0000000 --- a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch +++ /dev/null @@ -1,231 +0,0 @@ -From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang -Date: Sat, 19 Nov 2022 09:38:00 +0800 -Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR - 107692] - -Followed by the discussion in pr107692, -munroll-only-small-loops -Does not turns on/off -funroll-loops, and current check in -pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take -effect. Revert the change about targetm.loop_unroll_adjust and apply -the backend option change to strictly follow the rule that --funroll-loops takes full control of loop unrolling, and -munroll-only-small-loops just change its behavior to unroll small size -loops. - -gcc/ChangeLog: - - PR target/107692 - * common/config/i386/i386-common.cc (ix86_optimization_table): - Enable loop unroll O2, disable -fweb and -frename-registers - by default. - * config/i386/i386-options.cc - (ix86_override_options_after_change): - Disable small loop unroll when funroll-loops enabled, reset - cunroll_grow_size when it is not explicitly enabled. - (ix86_option_override_internal): Call - ix86_override_options_after_change instead of calling - ix86_recompute_optlev_based_flags and ix86_default_align - separately. - * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll - factor if -munroll-only-small-loops enabled. - * loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable - loop unrolling for -O2-speed. - (pass_rtl_unroll_loops::execute): Rmove - targetm.loop_unroll_adjust check. 
- -gcc/testsuite/ChangeLog: - - PR target/107692 - * gcc.dg/guality/loop-1.c: Remove additional option for ia32. - * gcc.target/i386/pr86270.c: Add -fno-unroll-loops. - * gcc.target/i386/pr93002.c: Likewise. ---- - gcc/common/config/i386/i386-common.cc | 8 ++++++ - gcc/config/i386/i386-options.cc | 34 ++++++++++++++++++++++--- - gcc/config/i386/i386.cc | 18 ++++--------- - gcc/loop-init.cc | 11 +++----- - gcc/testsuite/gcc.dg/guality/loop-1.c | 2 -- - gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- - gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- - 7 files changed, 49 insertions(+), 28 deletions(-) - -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index e1c1fb07d8a..5e777849f91 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] = - /* The STC algorithm produces the smallest code at -Os, for x86. */ - { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, - REORDER_BLOCKS_ALGORITHM_STC }, -+ -+ /* Turn on -funroll-loops with -munroll-only-small-loops to enable small -+ loop unrolling at -O2. */ -+ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 }, - { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, -+ /* Turns off -frename-registers and -fweb which are enabled by -+ funroll-loops. */ -+ { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 }, -+ { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 }, - /* Turn off -fschedule-insns by default. It tends to make the - problem with not enough registers even worse. 
*/ - { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index 32cc58a764b..b853ff55825 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts, - void - ix86_override_options_after_change (void) - { -+ /* Default align_* from the processor table. */ - ix86_default_align (&global_options); -+ - ix86_recompute_optlev_based_flags (&global_options, &global_options_set); -+ -+ /* Disable unrolling small loops when there's explicit -+ -f{,no}unroll-loop. */ -+ if ((OPTION_SET_P (flag_unroll_loops)) -+ || (OPTION_SET_P (flag_unroll_all_loops) -+ && flag_unroll_all_loops)) -+ { -+ if (!OPTION_SET_P (ix86_unroll_only_small_loops)) -+ ix86_unroll_only_small_loops = 0; -+ /* Re-enable -frename-registers and -fweb if funroll-loops -+ enabled. */ -+ if (!OPTION_SET_P (flag_web)) -+ flag_web = flag_unroll_loops; -+ if (!OPTION_SET_P (flag_rename_registers)) -+ flag_rename_registers = flag_unroll_loops; -+ /* -fcunroll-grow-size default follws -f[no]-unroll-loops. */ -+ if (!OPTION_SET_P (flag_cunroll_grow_size)) -+ flag_cunroll_grow_size = flag_unroll_loops -+ || flag_peel_loops -+ || optimize >= 3; -+ } -+ else -+ { -+ if (!OPTION_SET_P (flag_cunroll_grow_size)) -+ flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; -+ } -+ - } - - /* Clear stack slot assignments remembered from previous functions. 
-@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p, - - set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes); - -- ix86_recompute_optlev_based_flags (opts, opts_set); -+ ix86_override_options_after_change (); - - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality -@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p, - || TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_regparm = REGPARM_MAX; - -- /* Default align_* from the processor table. */ -- ix86_default_align (opts); -- - /* Provide default for -mbranch-cost= value. */ - SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost, - ix86_tune_cost->branch_cost); -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index 39b2468799c..000415c0e2e 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) - - /* Unroll small size loop when unroll factor is not explicitly - specified. */ -- if (!(flag_unroll_loops -- || flag_unroll_all_loops -- || loop->unroll)) -+ if (ix86_unroll_only_small_loops && !loop->unroll) - { -- nunroll = 1; -- -- /* Any explicit -f{no-}unroll-{all-}loops turns off -- -munroll-only-small-loops. 
*/ -- if (ix86_unroll_only_small_loops -- && !OPTION_SET_P (flag_unroll_loops) -- && loop->ninsns <= ix86_cost->small_unroll_ninsns) -- nunroll = ix86_cost->small_unroll_factor; -- -- return nunroll; -+ if (loop->ninsns <= ix86_cost->small_unroll_ninsns) -+ return MIN (nunroll, ix86_cost->small_unroll_factor); -+ else -+ return 1; - } - - if (!TARGET_ADJUST_UNROLL) -diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc -index 84336865ef7..ed1b2f6ebab 100644 ---- a/gcc/loop-init.cc -+++ b/gcc/loop-init.cc -@@ -565,12 +565,10 @@ public: - {} - - /* opt_pass methods: */ -- virtual bool gate (function * fun) -+ virtual bool gate (function *) - { -- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll -- || (targetm.loop_unroll_adjust -- && optimize >= 2 -- && optimize_function_for_speed_p (fun))); -+ return (flag_unroll_loops || flag_unroll_all_loops -+ || cfun->has_unroll); - } - - virtual unsigned int execute (function *); -@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun) - if (dump_file) - df_dump (dump_file); - -- if (flag_unroll_loops -- || targetm.loop_unroll_adjust) -+ if (flag_unroll_loops) - flags |= UAP_UNROLL; - if (flag_unroll_all_loops) - flags |= UAP_UNROLL_ALL; -diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c -index a32ea445a3f..1b1f6d32271 100644 ---- a/gcc/testsuite/gcc.dg/guality/loop-1.c -+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c -@@ -1,7 +1,5 @@ - /* { dg-do run } */ - /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ --/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ -- - - #include "../nop.h" - -diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c -index cbc9fbb0450..98b012caf23 100644 ---- a/gcc/testsuite/gcc.target/i386/pr86270.c -+++ b/gcc/testsuite/gcc.target/i386/pr86270.c -@@ -1,5 +1,5 @@ - /* { dg-do compile } */ --/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ -+/* { 
dg-options "-O2 -fno-unroll-loops" } */ - - int *a; - long len; -diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c -index f75a847f75d..7e2d869e17b 100644 ---- a/gcc/testsuite/gcc.target/i386/pr93002.c -+++ b/gcc/testsuite/gcc.target/i386/pr93002.c -@@ -1,6 +1,6 @@ - /* PR target/93002 */ - /* { dg-do compile } */ --/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ -+/* { dg-options "-O2 -fno-unroll-loops" } */ - /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ - - volatile int sink; --- -2.18.2 - diff --git a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch deleted file mode 100644 index ad65965..0000000 --- a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch +++ /dev/null @@ -1,35 +0,0 @@ -From b7980cd8d8bcf41b3ca1b6f3ba147789d42a9b99 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang -Date: Tue, 6 Dec 2022 09:53:35 +0800 -Subject: [PATCH] i386: Avoid fma_chain for -march=alderlake and - sapphirerapids. - -For Alderlake there is similar issue like PR 81616, enable -avoid_fma256_chain will also benefit on Intel latest platforms -Alderlake and Sapphire Rapids. - -gcc/ChangeLog: - - * config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add - m_SAPPHIRERAPIDS, m_ALDERLAKE. ---- - gcc/config/i386/x86-tune.def | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def -index d983e2f6213..1e1b206a71c 100644 ---- a/gcc/config/i386/x86-tune.def -+++ b/gcc/config/i386/x86-tune.def -@@ -485,7 +485,8 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER) - - /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or - smaller FMA chain. 
*/ --DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3) -+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 -+ | m_ALDERLAKE | m_SAPPHIRERAPIDS) - - /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd - for v2df vector reduction. */ --- -2.18.2 - diff --git a/dist b/dist index 9c0e36e..1fe92cf 100644 --- a/dist +++ b/dist @@ -1 +1 @@ -an8 +an8_10 diff --git a/gcc.spec b/gcc.spec index 4622976..b7955aa 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,4 +1,6 @@ -%define anolis_release .0.1 +%{?scl_package:%global scl gcc-toolset-12} +%global scl_prefix gcc-toolset-12- +BuildRequires: scl-utils-build %global __python /usr/bin/python3 %{?scl:%global __strip %%{_scl_root}/usr/bin/strip} %{?scl:%global __objdump %%{_scl_root}/usr/bin/objdump} @@ -148,7 +150,7 @@ Summary: GCC version 12 Name: %{?scl_prefix}gcc Version: %{gcc_version} -Release: %{gcc_release}.4%{anolis_release}%{?dist} +Release: %{gcc_release}.6%{?dist} # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have # GCC Runtime Exception. License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD @@ -194,10 +196,8 @@ URL: http://gcc.gnu.org # Need binutils which support -plugin # Need binutils which support .loc view >= 2.30 # Need binutils which support --generate-missing-build-notes=yes >= 2.31 -%if 0%{?scl:1} BuildRequires: %{?scl_prefix}binutils >= 2.31 BuildRequires: %{?scl_prefix}gdb >= 7.4.50 -%endif # While gcc doesn't include statically linked binaries, during testing # -static is used several times. 
BuildRequires: glibc-static @@ -352,6 +352,7 @@ Patch11: gcc12-d-shared-libphobos.patch Patch12: gcc12-pr107468.patch Patch15: gcc12-static-libquadmath.patch Patch16: gcc12-FMA-chains.patch +Patch17: gcc12-pr113960.patch Patch100: gcc12-fortran-fdec-duplicates.patch Patch101: gcc12-fortran-flogical-as-integer.patch @@ -388,9 +389,6 @@ Patch3017: 0020-more-fixes.patch Patch3018: 0021-libstdc++-disable-tests.patch Patch3019: 0022-libstdc++-revert-behavior.patch Patch3020: gcc12-testsuite-typo.patch -Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch -Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch -Patch5003: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch %if 0%{?rhel} == 9 %global nonsharedver 110 @@ -735,6 +733,7 @@ so that there cannot be any synchronization problems. %patch12 -p0 -b .pr107468~ %patch15 -p0 -b .static-libquadmath~ %patch16 -p1 -b .fma~ +%patch17 -p1 -b .pr113960~ %if 0%{?rhel} >= 6 %patch100 -p1 -b .fortran-fdec-duplicates~ @@ -800,9 +799,6 @@ cd .. 
%patch3019 -p1 -b .dts-test-19~ %endif %patch3020 -p1 -b .typo -%patch5001 -p1 -b .dts-test-22~ -%patch5002 -p1 -b .dts-test-23~ -%patch5003 -p1 -b .dts-test-24~ find gcc/testsuite -name \*.pr96939~ | xargs rm -f @@ -2997,10 +2993,12 @@ fi %endif %changelog -* Tue May 30 2023 Haochen Jiang 12.2.1-7.4.0.1 -- Add attribute hot judgement for INLINE_HINT_known_hot hint -- Enable small loop unrolling for O2 -- i386: Only enable small loop unrolling in backend [PR 107692] +* Wed Apr 3 2024 Marek Polacek 12.2.1-7.6 +- bump NVR (RHEL-31253) + +* Tue Mar 26 2024 Marek Polacek 12.2.1-7.5 +- fix conditions for using memcmp in + std::lexicographical_compare_three_way (PR libstdc++/113960, RHEL-29952) * Fri Feb 10 2023 Marek Polacek 12.2.1-7.4 - avoid fma_chain for -march=alderlake and sapphirerapids (#2168917) diff --git a/gcc12-pr113960.patch b/gcc12-pr113960.patch new file mode 100644 index 0000000..98d2f86 --- /dev/null +++ b/gcc12-pr113960.patch @@ -0,0 +1,107 @@ +commit 6f5dcea85a31845ec6f4b6886734b0f02e013718 +Author: Jonathan Wakely +Date: Tue Feb 27 17:50:34 2024 +0000 + + libstdc++: Fix conditions for using memcmp in std::lexicographical_compare_three_way [PR113960] + + The change in r11-2981-g2f983fa69005b6 meant that + std::lexicographical_compare_three_way started to use memcmp for + unsigned integers on big endian targets, but for that to be valid we + need the two value types to have the same size and we need to use that + size to compute the length passed to memcmp. + + I already defined a __is_memcmp_ordered_with trait that does the right + checks, std::lexicographical_compare_three_way just needs to use it. + + libstdc++-v3/ChangeLog: + + PR libstdc++/113960 + * include/bits/stl_algobase.h (__is_byte_iter): Replace with ... + (__memcmp_ordered_with): New concept. + (lexicographical_compare_three_way): Use __memcmp_ordered_with + instead of __is_byte_iter. Use correct length for memcmp. 
+ * testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc: + New test. + + (cherry picked from commit f5cdda8acb06c20335855ed353ab9a441c12128a) + +diff --git a/libstdc++-v3/include/bits/stl_algobase.h b/libstdc++-v3/include/bits/stl_algobase.h +index 7664301a208..6e648e48ad0 100644 +--- a/libstdc++-v3/include/bits/stl_algobase.h ++++ b/libstdc++-v3/include/bits/stl_algobase.h +@@ -1780,11 +1780,14 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO + } + + #if __cpp_lib_three_way_comparison +- // Iter points to a contiguous range of unsigned narrow character type +- // or std::byte, suitable for comparison by memcmp. +- template +- concept __is_byte_iter = contiguous_iterator<_Iter> +- && __is_memcmp_ordered>::__value; ++ // Both iterators refer to contiguous ranges of unsigned narrow characters, ++ // or std::byte, or big-endian unsigned integers, suitable for comparison ++ // using memcmp. ++ template ++ concept __memcmp_ordered_with ++ = (__is_memcmp_ordered_with, ++ iter_value_t<_Iter2>>::__value) ++ && contiguous_iterator<_Iter1> && contiguous_iterator<_Iter2>; + + // Return a struct with two members, initialized to the smaller of x and y + // (or x if they compare equal) and the result of the comparison x <=> y. 
+@@ -1834,20 +1837,20 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO + if (!std::__is_constant_evaluated()) + if constexpr (same_as<_Comp, __detail::_Synth3way> + || same_as<_Comp, compare_three_way>) +- if constexpr (__is_byte_iter<_InputIter1>) +- if constexpr (__is_byte_iter<_InputIter2>) +- { +- const auto [__len, __lencmp] = _GLIBCXX_STD_A:: +- __min_cmp(__last1 - __first1, __last2 - __first2); +- if (__len) +- { +- const auto __c +- = __builtin_memcmp(&*__first1, &*__first2, __len) <=> 0; +- if (__c != 0) +- return __c; +- } +- return __lencmp; +- } ++ if constexpr (__memcmp_ordered_with<_InputIter1, _InputIter2>) ++ { ++ const auto [__len, __lencmp] = _GLIBCXX_STD_A:: ++ __min_cmp(__last1 - __first1, __last2 - __first2); ++ if (__len) ++ { ++ const auto __blen = __len * sizeof(*__first1); ++ const auto __c ++ = __builtin_memcmp(&*__first1, &*__first2, __blen) <=> 0; ++ if (__c != 0) ++ return __c; ++ } ++ return __lencmp; ++ } + + while (__first1 != __last1) + { +diff --git a/libstdc++-v3/testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc b/libstdc++-v3/testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc +new file mode 100644 +index 00000000000..d51ae1a3d50 +--- /dev/null ++++ b/libstdc++-v3/testsuite/25_algorithms/lexicographical_compare_three_way/113960.cc +@@ -0,0 +1,15 @@ ++// { dg-do run { target c++20 } } ++ ++// PR libstdc++/113960 ++// std::map with std::vector as input overwrites itself with c++20, on s390x ++ ++#include ++#include ++ ++int main() ++{ ++ unsigned short a1[] { 1, 2, 3 }; ++ unsigned short a2[] { 1, 2, 4 }; ++ // Incorrect memcmp comparison for big endian targets. 
++ VERIFY( std::lexicographical_compare_three_way(a1, a1+3, a2, a2+3) < 0 ); ++} -- Gitee From c7167efd4293aa18e9f56fd57bea8cad688880ea Mon Sep 17 00:00:00 2001 From: Haochen Jiang Date: Thu, 10 Nov 2022 09:40:26 +0800 Subject: [PATCH 2/6] i386: Add syscall to enable AMX for latest kernels gcc/testsuite/ChangeLog: * gcc.target/i386/amx-check.h (request_perm_xtile_data): New function to check if AMX is usable and enable AMX. (main): Run test if AMX is usable. url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5e377d21f1f345d8b157b9bc306e02bb9bd45e01 --- ...all-to-enable-AMX-for-latest-kernels.patch | 77 +++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch new file mode 100644 index 0000000..94625b5 --- /dev/null +++ b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch @@ -0,0 +1,77 @@ +From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Thu, 16 Jun 2022 00:15:53 -0700 +Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/amx-check.h (request_perm_xtile_data): + New function to check if AMX is usable and enable AMX. + (main): Run test if AMX is usable. 
+--- + gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h +index 434b0e59703..6fff5ff4631 100644 +--- a/gcc/testsuite/gcc.target/i386/amx-check.h ++++ b/gcc/testsuite/gcc.target/i386/amx-check.h +@@ -4,11 +4,24 @@ + #include + #include + #include ++#include ++#ifdef __linux__ ++#include ++#endif + #ifdef DEBUG + #include + #endif + #include "cpuid.h" + ++#define XFEATURE_XTILECFG 17 ++#define XFEATURE_XTILEDATA 18 ++#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) ++#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) ++#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) ++ ++#define ARCH_GET_XCOMP_PERM 0x1022 ++#define ARCH_REQ_XCOMP_PERM 0x1023 ++ + /* TODO: The tmm emulation is temporary for current + AMX implementation with no tmm regclass, should + be changed in the future. */ +@@ -44,6 +57,20 @@ typedef struct __tile + /* Stride (colum width in byte) used for tileload/store */ + #define _STRIDE 64 + ++#ifdef __linux__ ++/* We need syscall to use amx functions */ ++int request_perm_xtile_data() ++{ ++ unsigned long bitmask; ++ ++ if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) || ++ syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask)) ++ return 0; ++ ++ return (bitmask & XFEATURE_MASK_XTILE) != 0; ++} ++#endif ++ + /* Initialize tile config by setting all tmm size to 16x64 */ + void init_tile_config (__tilecfg_u *dst) + { +@@ -185,6 +212,9 @@ main () + #endif + #ifdef AMX_BF16 + && __builtin_cpu_supports ("amx-bf16") ++#endif ++#ifdef __linux__ ++ && request_perm_xtile_data () + #endif + ) + { +-- +2.18.2 + diff --git a/gcc.spec b/gcc.spec index b7955aa..562650f 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,3 +1,4 @@ +%define anolis_release .0.1 %{?scl_package:%global scl gcc-toolset-12} %global scl_prefix gcc-toolset-12- BuildRequires: 
scl-utils-build @@ -150,7 +151,7 @@ BuildRequires: scl-utils-build Summary: GCC version 12 Name: %{?scl_prefix}gcc Version: %{gcc_version} -Release: %{gcc_release}.6%{?dist} +Release: %{gcc_release}.6%{anolis_release}%{?dist} # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have # GCC Runtime Exception. License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD @@ -389,6 +390,7 @@ Patch3017: 0020-more-fixes.patch Patch3018: 0021-libstdc++-disable-tests.patch Patch3019: 0022-libstdc++-revert-behavior.patch Patch3020: gcc12-testsuite-typo.patch +Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch %if 0%{?rhel} == 9 %global nonsharedver 110 @@ -799,6 +801,7 @@ cd .. %patch3019 -p1 -b .dts-test-19~ %endif %patch3020 -p1 -b .typo +%patch5000 -p1 -b .dts-test-20~ find gcc/testsuite -name \*.pr96939~ | xargs rm -f @@ -2993,6 +2996,9 @@ fi %endif %changelog +* Wed Jul 17 2024 Haochen Jiang 12.2.1-7.6.0.1 +- i386: Add syscall to enable AMX for latest kernels + * Wed Apr 3 2024 Marek Polacek 12.2.1-7.6 - bump NVR (RHEL-31253) -- Gitee From fc63f978f187017c26dd4840481f1aa9aef4cdd9 Mon Sep 17 00:00:00 2001 From: "Cui,Lili" Date: Thu, 10 Nov 2022 09:48:30 +0800 Subject: [PATCH 3/6] Add attribute hot judgement for INLINE_HINT_known_hot hint. We set up INLINE_HINT_known_hot hint only when we have profile feedback, now add function attribute judgement for it, when both caller and callee have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint for it. With this patch applied, ADL Multi-copy: 538.imagic_r 16.7% ICX Multi-copy: 538.imagic_r 15.2% CLX Multi-copy: 538.imagic_r 12.7% Znver3 Multi-copy: 538.imagic_r 10.6% Arm Multi-copy: 538.imagic_r 13.4% gcc/ChangeLog * ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute judgement for INLINE_HINT_known_hot hint. gcc/testsuite/ChangeLog: * gcc.dg/ipa/inlinehint-6.c: New test. 
url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a --- ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 ++++++++++++++++++ gcc.spec | 3 + 2 files changed, 126 insertions(+) create mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch new file mode 100644 index 0000000..3e70f0c --- /dev/null +++ b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch @@ -0,0 +1,123 @@ +From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001 +From: "Cui,Lili" +Date: Tue, 1 Nov 2022 09:16:49 +0800 +Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint. + +We set up INLINE_HINT_known_hot hint only when we have profile feedback, +now add function attribute judgement for it, when both caller and callee +have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint +for it. + +With this patch applied, +ADL Multi-copy: 538.imagic_r 16.7% +ICX Multi-copy: 538.imagic_r 15.2% +CLX Multi-copy: 538.imagic_r 12.7% +Znver3 Multi-copy: 538.imagic_r 10.6% +Arm Multi-copy: 538.imagic_r 13.4% + +gcc/ChangeLog + + * ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute + judgement for INLINE_HINT_known_hot hint. + +gcc/testsuite/ChangeLog: + + * gcc.dg/ipa/inlinehint-6.c: New test. +--- + gcc/ipa-inline-analysis.cc | 13 ++++--- + gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++ + 2 files changed, 56 insertions(+), 4 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c + +diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc +index 1ca685d1b0e..7bd29c36590 100644 +--- a/gcc/ipa-inline-analysis.cc ++++ b/gcc/ipa-inline-analysis.cc +@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. 
If not see + #include "ipa-utils.h" + #include "cfgexpand.h" + #include "gimplify.h" ++#include "attribs.h" + + /* Cached node/edge growths. */ + fast_call_summary *edge_growth_cache = NULL; +@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time) + hints = estimates.hints; + } + +- /* When we have profile feedback, we can quite safely identify hot +- edges and for those we disable size limits. Don't do that when +- probability that caller will call the callee is low however, since it ++ /* When we have profile feedback or function attribute, we can quite safely ++ identify hot edges and for those we disable size limits. Don't do that ++ when probability that caller will call the callee is low however, since it + may hurt optimization of the caller's hot path. */ +- if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p () ++ if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p () + && (edge->count.ipa ().apply_scale (2, 1) + > (edge->caller->inlined_to + ? 
edge->caller->inlined_to->count.ipa () + : edge->caller->count.ipa ()))) ++ || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl)) ++ != NULL ++ && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl)) ++ != NULL)) + hints |= INLINE_HINT_known_hot; + + gcc_checking_assert (size >= 0); +diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c +new file mode 100644 +index 00000000000..1f3be641c6d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c +@@ -0,0 +1,47 @@ ++/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp" } */ ++/* { dg-add-options bind_pic_locally } */ ++ ++#define size_t long long int ++ ++struct A ++{ ++ size_t f1, f2, f3, f4; ++}; ++struct C ++{ ++ struct A a; ++ size_t b; ++}; ++struct C x; ++ ++__attribute__((hot)) struct C callee (struct A *a, struct C *c) ++{ ++ c->a=(*a); ++ ++ if((c->b + 7) & 17) ++ { ++ c->a.f1 = c->a.f2 + c->a.f1; ++ c->a.f2 = c->a.f3 - c->a.f2; ++ c->a.f3 = c->a.f2 + c->a.f3; ++ c->a.f4 = c->a.f2 - c->a.f4; ++ c->b = c->a.f2; ++ ++ } ++ return *c; ++} ++ ++__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c) ++{ ++ struct A a; ++ a.f1 = 1 + d; ++ a.f2 = e; ++ a.f3 = 12 + f; ++ a.f4 = 68 + g; ++ if (c->b > 0) ++ return callee (&a, c); ++ else ++ return *c; ++} ++ ++/* { dg-final { scan-ipa-dump "known_hot" "inline" } } */ ++ +-- +2.18.2 + diff --git a/gcc.spec b/gcc.spec index 562650f..15b721f 100644 --- a/gcc.spec +++ b/gcc.spec @@ -391,6 +391,7 @@ Patch3018: 0021-libstdc++-disable-tests.patch Patch3019: 0022-libstdc++-revert-behavior.patch Patch3020: gcc12-testsuite-typo.patch Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch +Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch %if 0%{?rhel} == 9 %global nonsharedver 110 @@ -802,6 +803,7 @@ cd .. 
%endif %patch3020 -p1 -b .typo %patch5000 -p1 -b .dts-test-20~ +%patch5001 -p1 -b .dts-test-22~ find gcc/testsuite -name \*.pr96939~ | xargs rm -f @@ -2998,6 +3000,7 @@ fi %changelog * Wed Jul 17 2024 Haochen Jiang 12.2.1-7.6.0.1 - i386: Add syscall to enable AMX for latest kernels +- Add attribute hot judgement for INLINE_HINT_known_hot hint * Wed Apr 3 2024 Marek Polacek 12.2.1-7.6 - bump NVR (RHEL-31253) -- Gitee From f800c4e779e38e12a039c07f6fe0e3170e02eec7 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Tue, 29 Nov 2022 14:18:12 +0800 Subject: [PATCH 4/6] Enable small loop unrolling for O2 Modern processors has multiple way instruction decoders For x86, icelake/zen3 has 5 uops, so for small loop with <= 4 instructions (usually has 3 uops with a cmp/jmp pair that can be macro-fused), the decoder would have 2 uops bubble for each iteration and the pipeline could not be fully utilized. Therefore, this patch enables loop unrolling for small size loop at O2 to fullfill the decoder as much as possible. It turns on rtl loop unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only. In x86 backend the default behavior is to unroll small loops with less than 4 insns by 1 time. This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with 0.9% codesize increment. For other benchmarks the variants are minor and overall codesize increased by 0.2%. The kernel image size increased by 0.06%, and no impact on eembc. gcc/ChangeLog: * common/config/i386/i386-common.cc (ix86_optimization_table): Enable small loop unroll at O2 by default. * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll factor if -munroll-only-small-loops enabled and -funroll-loops/ -funroll-all-loops are disabled. * config/i386/i386.h (struct processor_costs): Add 2 field small_unroll_ninsns and small_unroll_factor. * config/i386/i386.opt: Add -munroll-only-small-loops. * doc/invoke.texi: Document -munroll-only-small-loops. 
* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl loop unrolling for -O2-speed and above if target hook loop_unroll_adjust exists. (pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag when target hook loop_unroll_adjust exists. * config/i386/x86-tune-costs.h: Update all processor costs with small_unroll_ninsns = 4 and small_unroll_factor = 2. gcc/testsuite/ChangeLog: * gcc.dg/guality/loop-1.c: Add additional option -mno-unroll-only-small-loops. * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops. * gcc.target/i386/pr93002.c: Likewise. url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=071e428c24ee8c1ed062597a093708bba29509c9 --- 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ++++++++++++++++++ gcc.spec | 3 + 2 files changed, 484 insertions(+) create mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch new file mode 100644 index 0000000..b16171b --- /dev/null +++ b/0026-Enable-small-loop-unrolling-for-O2.patch @@ -0,0 +1,481 @@ +From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Thu, 8 Sep 2022 16:52:02 +0800 +Subject: [PATCH 4/5] Enable small loop unrolling for O2 + +Modern processors has multiple way instruction decoders +For x86, icelake/zen3 has 5 uops, so for small loop with <= 4 +instructions (usually has 3 uops with a cmp/jmp pair that can be +macro-fused), the decoder would have 2 uops bubble for each iteration +and the pipeline could not be fully utilized. + +Therefore, this patch enables loop unrolling for small size loop at O2 +to fullfill the decoder as much as possible. It turns on rtl loop +unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only. +In x86 backend the default behavior is to unroll small loops with less +than 4 insns by 1 time. + +This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with +0.9% codesize increment. 
For other benchmarks the variants are minor +and overall codesize increased by 0.2%. + +The kernel image size increased by 0.06%, and no impact on eembc. + +gcc/ChangeLog: + + * common/config/i386/i386-common.cc (ix86_optimization_table): + Enable small loop unroll at O2 by default. + * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll + factor if -munroll-only-small-loops enabled and -funroll-loops/ + -funroll-all-loops are disabled. + * config/i386/i386.h (struct processor_costs): Add 2 field + small_unroll_ninsns and small_unroll_factor. + * config/i386/i386.opt: Add -munroll-only-small-loops. + * doc/invoke.texi: Document -munroll-only-small-loops. + * loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl + loop unrolling for -O2-speed and above if target hook + loop_unroll_adjust exists. + (pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag + when target hook loop_unroll_adjust exists. + * config/i386/x86-tune-costs.h: Update all processor costs + with small_unroll_ninsns = 4 and small_unroll_factor = 2. + +gcc/testsuite/ChangeLog: + + * gcc.dg/guality/loop-1.c: Add additional option + -mno-unroll-only-small-loops. + * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops. + * gcc.target/i386/pr93002.c: Likewise. 
+--- + gcc/common/config/i386/i386-common.cc | 1 + + gcc/config/i386/i386.cc | 18 ++++++++ + gcc/config/i386/i386.h | 5 +++ + gcc/config/i386/i386.opt | 4 ++ + gcc/config/i386/x86-tune-costs.h | 56 +++++++++++++++++++++++++ + gcc/doc/invoke.texi | 11 ++++- + gcc/loop-init.cc | 10 +++-- + gcc/testsuite/gcc.dg/guality/loop-1.c | 2 + + gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- + gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- + 10 files changed, 105 insertions(+), 6 deletions(-) + +diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc +index 07fdd045f30..e1c1fb07d8a 100644 +--- a/gcc/common/config/i386/i386-common.cc ++++ b/gcc/common/config/i386/i386-common.cc +@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] = + /* The STC algorithm produces the smallest code at -Os, for x86. */ + { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, + REORDER_BLOCKS_ALGORITHM_STC }, ++ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, + /* Turn off -fschedule-insns by default. It tends to make the + problem with not enough registers even worse. */ + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index b16df5b183e..39b2468799c 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) + unsigned i; + unsigned mem_count = 0; + ++ /* Unroll small size loop when unroll factor is not explicitly ++ specified. */ ++ if (!(flag_unroll_loops ++ || flag_unroll_all_loops ++ || loop->unroll)) ++ { ++ nunroll = 1; ++ ++ /* Any explicit -f{no-}unroll-{all-}loops turns off ++ -munroll-only-small-loops. 
*/ ++ if (ix86_unroll_only_small_loops ++ && !OPTION_SET_P (flag_unroll_loops) ++ && loop->ninsns <= ix86_cost->small_unroll_ninsns) ++ nunroll = ix86_cost->small_unroll_factor; ++ ++ return nunroll; ++ } ++ + if (!TARGET_ADJUST_UNROLL) + return nunroll; + +diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h +index a61c32b8957..421801111a7 100644 +--- a/gcc/config/i386/i386.h ++++ b/gcc/config/i386/i386.h +@@ -219,6 +219,11 @@ struct processor_costs { + const char *const align_jump; /* Jump alignment. */ + const char *const align_label; /* Label alignment. */ + const char *const align_func; /* Function alignment. */ ++ ++ const unsigned small_unroll_ninsns; /* Insn count limit for small loop ++ to be unrolled. */ ++ const unsigned small_unroll_factor; /* Unroll factor for small loop to ++ be unrolled. */ + }; + + extern const struct processor_costs *ix86_cost; +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index a6b0e28f238..3d369647bf7 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols. + -param=x86-stlf-window-ninsns= + Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param + Instructions number above which STFL stall penalty can be compensated. ++ ++munroll-only-small-loops ++Target Var(ix86_unroll_only_small_loops) Init(0) Save ++Enable conservative small loop unrolling. +diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h +index 017ffa69958..b4303e4e971 100644 +--- a/gcc/config/i386/x86-tune-costs.h ++++ b/gcc/config/i386/x86-tune-costs.h +@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. 
*/ + }; + + /* Processor costs (relative to an add) */ +@@ -244,6 +246,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */ + "4", /* Jump alignment. */ + NULL, /* Label alignment. */ + "4", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs i486_memcpy[2] = { +@@ -354,6 +358,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */ + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs pentium_memcpy[2] = { +@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static const +@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes +@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs geode_memcpy[2] = { +@@ -786,6 +798,8 @@ struct processor_costs geode_cost = { + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs k6_memcpy[2] = { +@@ -896,6 +910,8 @@ struct processor_costs k6_cost = { + "32:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "32", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. 
*/ + }; + + /* For some reason, Athlon deals better with REP prefix (relative to loops) +@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* K8 has optimized REP instruction for medium sized blocks, but for very +@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for +@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = { + "32:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "32", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* BDVER has optimized REP instruction for medium sized blocks, but for +@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "11", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + +@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* ZNVER2 has optimized REP instruction for medium sized blocks, but for +@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + struct processor_costs znver3_cost = { +@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. 
*/ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ +@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* icelake_cost should produce code tuned for Icelake family of CPUs. +@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* alderlake_cost should produce code tuned for alderlake family of CPUs. */ +@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* BTVER1 has optimized REP instruction for medium sized blocks, but for +@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "11", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs btver2_memcpy[2] = { +@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "11", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs pentium4_memcpy[2] = { +@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = { + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. 
*/ + }; + + static stringop_algs nocona_memcpy[2] = { +@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = { + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs atom_memcpy[2] = { +@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs slm_memcpy[2] = { +@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs tremont_memcpy[2] = { +@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs intel_memcpy[2] = { +@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* Generic should produce code tuned for Core-i7 (and newer chips) +@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* core_cost should produce code tuned for Core familly of CPUs. */ +@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. 
*/ + }; + +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 9ac7f89ebb1..1961cafa2bb 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options. + -mgeneral-regs-only -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol + -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol + -mindirect-branch-register -mharden-sls=@var{choice} @gol +--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access} ++-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol ++-munroll-only-small-loops} + + @emph{x86 Windows Options} + @gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol +@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols. The default is + @option{-mno-direct-extern-access} and executable compiled with + @option{-mdirect-extern-access} may not be binary compatible if + protected symbols are used in shared libraries and executable. ++ ++@item -munroll-only-small-loops ++@opindex munroll-only-small-loops ++@opindex mno-unroll-only-small-loops ++Controls conservative small loop unrolling. It is default enabled by ++O2, and unrolls loop with less than 4 insns by 1 time. Explicit ++-f[no-]unroll-[all-]loops would disable this flag to avoid any ++unintended unrolling behavior that user does not want. 
+ @end table + + @node x86 Windows Options +diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc +index 1e4f6cfd7fb..84336865ef7 100644 +--- a/gcc/loop-init.cc ++++ b/gcc/loop-init.cc +@@ -565,9 +565,12 @@ public: + {} + + /* opt_pass methods: */ +- virtual bool gate (function *) ++ virtual bool gate (function * fun) + { +- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll); ++ return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll ++ || (targetm.loop_unroll_adjust ++ && optimize >= 2 ++ && optimize_function_for_speed_p (fun))); + } + + virtual unsigned int execute (function *); +@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun) + if (dump_file) + df_dump (dump_file); + +- if (flag_unroll_loops) ++ if (flag_unroll_loops ++ || targetm.loop_unroll_adjust) + flags |= UAP_UNROLL; + if (flag_unroll_all_loops) + flags |= UAP_UNROLL_ALL; +diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c +index 1b1f6d32271..a32ea445a3f 100644 +--- a/gcc/testsuite/gcc.dg/guality/loop-1.c ++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c +@@ -1,5 +1,7 @@ + /* { dg-do run } */ + /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ ++/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ ++ + + #include "../nop.h" + +diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c +index 81841ef5bd7..cbc9fbb0450 100644 +--- a/gcc/testsuite/gcc.target/i386/pr86270.c ++++ b/gcc/testsuite/gcc.target/i386/pr86270.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2" } */ ++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ + + int *a; + long len; +diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c +index 0248fcc00a5..f75a847f75d 100644 +--- a/gcc/testsuite/gcc.target/i386/pr93002.c ++++ b/gcc/testsuite/gcc.target/i386/pr93002.c +@@ -1,6 +1,6 @@ + /* PR target/93002 */ + /* { dg-do 
compile } */ +-/* { dg-options "-O2" } */ ++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ + /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ + + volatile int sink; +-- +2.18.2 + diff --git a/gcc.spec b/gcc.spec index 15b721f..1771eef 100644 --- a/gcc.spec +++ b/gcc.spec @@ -392,6 +392,7 @@ Patch3019: 0022-libstdc++-revert-behavior.patch Patch3020: gcc12-testsuite-typo.patch Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch +Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch %if 0%{?rhel} == 9 %global nonsharedver 110 @@ -804,6 +805,7 @@ cd .. %patch3020 -p1 -b .typo %patch5000 -p1 -b .dts-test-20~ %patch5001 -p1 -b .dts-test-22~ +%patch5002 -p1 -b .dts-test-23~ find gcc/testsuite -name \*.pr96939~ | xargs rm -f @@ -3001,6 +3003,7 @@ fi * Wed Jul 17 2024 Haochen Jiang 12.2.1-7.6.0.1 - i386: Add syscall to enable AMX for latest kernels - Add attribute hot judgement for INLINE_HINT_known_hot hint +- Enable small loop unrolling for O2 * Wed Apr 3 2024 Marek Polacek 12.2.1-7.6 - bump NVR (RHEL-31253) -- Gitee From 08b519110d055e6bd36f7647e30bf86d8d65cef8 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Tue, 29 Nov 2022 14:22:58 +0800 Subject: [PATCH 5/6] i386: Only enable small loop unrolling in backend [PR 107692] Followed by the discussion in pr107692, -munroll-only-small-loops Does not turns on/off -funroll-loops, and current check in pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take effect. Revert the change about targetm.loop_unroll_adjust and apply the backend option change to strictly follow the rule that -funroll-loops takes full control of loop unrolling, and munroll-only-small-loops just change its behavior to unroll small size loops. gcc/ChangeLog: PR target/107692 * common/config/i386/i386-common.cc (ix86_optimization_table): Enable loop unroll O2, disable -fweb and -frename-registers by default. 
* config/i386/i386-options.cc (ix86_override_options_after_change): Disable small loop unroll when funroll-loops enabled, reset cunroll_grow_size when it is not explicitly enabled. (ix86_option_override_internal): Call ix86_override_options_after_change instead of calling ix86_recompute_optlev_based_flags and ix86_default_align separately. * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll factor if -munroll-only-small-loops enabled. * loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable loop unrolling for -O2-speed. (pass_rtl_unroll_loops::execute): Remove targetm.loop_unroll_adjust check. gcc/testsuite/ChangeLog: PR target/107692 * gcc.dg/guality/loop-1.c: Remove additional option for ia32. * gcc.target/i386/pr86270.c: Add -fno-unroll-loops. * gcc.target/i386/pr93002.c: Likewise. url:https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8caf155a3d6e23e47bf55068ad23c23d4655a054 --- ...-small-loop-unrolling-in-backend-PR-.patch | 231 ++++++++++++++++++ gcc.spec | 3 + 2 files changed, 234 insertions(+) create mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch new file mode 100644 index 0000000..de3995f --- /dev/null +++ b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch @@ -0,0 +1,231 @@ +From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Sat, 19 Nov 2022 09:38:00 +0800 +Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR + 107692] + +Following the discussion in pr107692, -munroll-only-small-loops +does not turn on/off -funroll-loops, and current check in +pass_rtl_unroll_loops::gate would cause -fno-unroll-loops to not take +effect.
Revert the change about targetm.loop_unroll_adjust and apply +the backend option change to strictly follow the rule that +-funroll-loops takes full control of loop unrolling, and +-munroll-only-small-loops just changes its behavior to unroll small size +loops. + +gcc/ChangeLog: + + PR target/107692 + * common/config/i386/i386-common.cc (ix86_optimization_table): + Enable loop unroll O2, disable -fweb and -frename-registers + by default. + * config/i386/i386-options.cc + (ix86_override_options_after_change): + Disable small loop unroll when funroll-loops enabled, reset + cunroll_grow_size when it is not explicitly enabled. + (ix86_option_override_internal): Call + ix86_override_options_after_change instead of calling + ix86_recompute_optlev_based_flags and ix86_default_align + separately. + * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll + factor if -munroll-only-small-loops enabled. + * loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable + loop unrolling for -O2-speed. + (pass_rtl_unroll_loops::execute): Remove + targetm.loop_unroll_adjust check. + +gcc/testsuite/ChangeLog: + + PR target/107692 + * gcc.dg/guality/loop-1.c: Remove additional option for ia32. + * gcc.target/i386/pr86270.c: Add -fno-unroll-loops. + * gcc.target/i386/pr93002.c: Likewise.
+--- + gcc/common/config/i386/i386-common.cc | 8 ++++++ + gcc/config/i386/i386-options.cc | 34 ++++++++++++++++++++++--- + gcc/config/i386/i386.cc | 18 ++++--------- + gcc/loop-init.cc | 11 +++----- + gcc/testsuite/gcc.dg/guality/loop-1.c | 2 -- + gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- + gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- + 7 files changed, 49 insertions(+), 28 deletions(-) + +diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc +index e1c1fb07d8a..5e777849f91 100644 +--- a/gcc/common/config/i386/i386-common.cc ++++ b/gcc/common/config/i386/i386-common.cc +@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] = + /* The STC algorithm produces the smallest code at -Os, for x86. */ + { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, + REORDER_BLOCKS_ALGORITHM_STC }, ++ ++ /* Turn on -funroll-loops with -munroll-only-small-loops to enable small ++ loop unrolling at -O2. */ ++ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 }, + { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, ++ /* Turns off -frename-registers and -fweb which are enabled by ++ funroll-loops. */ ++ { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 }, ++ { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 }, + /* Turn off -fschedule-insns by default. It tends to make the + problem with not enough registers even worse. */ + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 32cc58a764b..b853ff55825 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts, + void + ix86_override_options_after_change (void) + { ++ /* Default align_* from the processor table. 
*/ + ix86_default_align (&global_options); ++ + ix86_recompute_optlev_based_flags (&global_options, &global_options_set); ++ ++ /* Disable unrolling small loops when there's explicit ++ -f{,no}unroll-loop. */ ++ if ((OPTION_SET_P (flag_unroll_loops)) ++ || (OPTION_SET_P (flag_unroll_all_loops) ++ && flag_unroll_all_loops)) ++ { ++ if (!OPTION_SET_P (ix86_unroll_only_small_loops)) ++ ix86_unroll_only_small_loops = 0; ++ /* Re-enable -frename-registers and -fweb if funroll-loops ++ enabled. */ ++ if (!OPTION_SET_P (flag_web)) ++ flag_web = flag_unroll_loops; ++ if (!OPTION_SET_P (flag_rename_registers)) ++ flag_rename_registers = flag_unroll_loops; ++ /* -fcunroll-grow-size default follws -f[no]-unroll-loops. */ ++ if (!OPTION_SET_P (flag_cunroll_grow_size)) ++ flag_cunroll_grow_size = flag_unroll_loops ++ || flag_peel_loops ++ || optimize >= 3; ++ } ++ else ++ { ++ if (!OPTION_SET_P (flag_cunroll_grow_size)) ++ flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; ++ } ++ + } + + /* Clear stack slot assignments remembered from previous functions. +@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p, + + set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes); + +- ix86_recompute_optlev_based_flags (opts, opts_set); ++ ix86_override_options_after_change (); + + ix86_tune_cost = processor_cost_table[ix86_tune]; + /* TODO: ix86_cost should be chosen at instruction or function granuality +@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p, + || TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_regparm = REGPARM_MAX; + +- /* Default align_* from the processor table. */ +- ix86_default_align (opts); +- + /* Provide default for -mbranch-cost= value. 
*/ + SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost, + ix86_tune_cost->branch_cost); +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 39b2468799c..000415c0e2e 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) + + /* Unroll small size loop when unroll factor is not explicitly + specified. */ +- if (!(flag_unroll_loops +- || flag_unroll_all_loops +- || loop->unroll)) ++ if (ix86_unroll_only_small_loops && !loop->unroll) + { +- nunroll = 1; +- +- /* Any explicit -f{no-}unroll-{all-}loops turns off +- -munroll-only-small-loops. */ +- if (ix86_unroll_only_small_loops +- && !OPTION_SET_P (flag_unroll_loops) +- && loop->ninsns <= ix86_cost->small_unroll_ninsns) +- nunroll = ix86_cost->small_unroll_factor; +- +- return nunroll; ++ if (loop->ninsns <= ix86_cost->small_unroll_ninsns) ++ return MIN (nunroll, ix86_cost->small_unroll_factor); ++ else ++ return 1; + } + + if (!TARGET_ADJUST_UNROLL) +diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc +index 84336865ef7..ed1b2f6ebab 100644 +--- a/gcc/loop-init.cc ++++ b/gcc/loop-init.cc +@@ -565,12 +565,10 @@ public: + {} + + /* opt_pass methods: */ +- virtual bool gate (function * fun) ++ virtual bool gate (function *) + { +- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll +- || (targetm.loop_unroll_adjust +- && optimize >= 2 +- && optimize_function_for_speed_p (fun))); ++ return (flag_unroll_loops || flag_unroll_all_loops ++ || cfun->has_unroll); + } + + virtual unsigned int execute (function *); +@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun) + if (dump_file) + df_dump (dump_file); + +- if (flag_unroll_loops +- || targetm.loop_unroll_adjust) ++ if (flag_unroll_loops) + flags |= UAP_UNROLL; + if (flag_unroll_all_loops) + flags |= UAP_UNROLL_ALL; +diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c +index 
a32ea445a3f..1b1f6d32271 100644 +--- a/gcc/testsuite/gcc.dg/guality/loop-1.c ++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c +@@ -1,7 +1,5 @@ + /* { dg-do run } */ + /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ +-/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ +- + + #include "../nop.h" + +diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c +index cbc9fbb0450..98b012caf23 100644 +--- a/gcc/testsuite/gcc.target/i386/pr86270.c ++++ b/gcc/testsuite/gcc.target/i386/pr86270.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ ++/* { dg-options "-O2 -fno-unroll-loops" } */ + + int *a; + long len; +diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c +index f75a847f75d..7e2d869e17b 100644 +--- a/gcc/testsuite/gcc.target/i386/pr93002.c ++++ b/gcc/testsuite/gcc.target/i386/pr93002.c +@@ -1,6 +1,6 @@ + /* PR target/93002 */ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ ++/* { dg-options "-O2 -fno-unroll-loops" } */ + /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ + + volatile int sink; +-- +2.18.2 + diff --git a/gcc.spec b/gcc.spec index 1771eef..05d7e9b 100644 --- a/gcc.spec +++ b/gcc.spec @@ -393,6 +393,7 @@ Patch3020: gcc12-testsuite-typo.patch Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch +Patch5003: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch %if 0%{?rhel} == 9 %global nonsharedver 110 @@ -806,6 +807,7 @@ cd .. 
%patch5000 -p1 -b .dts-test-20~ %patch5001 -p1 -b .dts-test-22~ %patch5002 -p1 -b .dts-test-23~ +%patch5003 -p1 -b .dts-test-24~ find gcc/testsuite -name \*.pr96939~ | xargs rm -f @@ -3004,6 +3006,7 @@ fi - i386: Add syscall to enable AMX for latest kernels - Add attribute hot judgement for INLINE_HINT_known_hot hint - Enable small loop unrolling for O2 +- i386: Only enable small loop unrolling in backend [PR 107692] * Wed Apr 3 2024 Marek Polacek 12.2.1-7.6 - bump NVR (RHEL-31253) -- Gitee From 04341889f8d21a8b61b79370af48be876972567d Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Wed, 14 Dec 2022 11:12:47 +0800 Subject: [PATCH 6/6] i386: Avoid fma_chain for -march=alderlake and sapphirerapids. For Alderlake there is similar issue like PR 81616, enable avoid_fma256_chain will also benefit on Intel latest platforms Alderlake and Sapphire Rapids. gcc/ChangeLog: * config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add m_SAPPHIRERAPIDS, m_ALDERLAKE. --- ...hain-for-march-alderlake-and-sapphir.patch | 35 +++++++++++++++++++ gcc.spec | 3 -- 2 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch diff --git a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch new file mode 100644 index 0000000..ad65965 --- /dev/null +++ b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch @@ -0,0 +1,35 @@ +From b7980cd8d8bcf41b3ca1b6f3ba147789d42a9b99 Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Tue, 6 Dec 2022 09:53:35 +0800 +Subject: [PATCH] i386: Avoid fma_chain for -march=alderlake and + sapphirerapids. + +For Alderlake there is similar issue like PR 81616, enable +avoid_fma256_chain will also benefit on Intel latest platforms +Alderlake and Sapphire Rapids. + +gcc/ChangeLog: + + * config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add + m_SAPPHIRERAPIDS, m_ALDERLAKE. 
+--- + gcc/config/i386/x86-tune.def | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def +index d983e2f6213..1e1b206a71c 100644 +--- a/gcc/config/i386/x86-tune.def ++++ b/gcc/config/i386/x86-tune.def +@@ -485,7 +485,8 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER) + + /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or + smaller FMA chain. */ +-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3) ++DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 ++ | m_ALDERLAKE | m_SAPPHIRERAPIDS) + + /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd + for v2df vector reduction. */ +-- +2.18.2 + diff --git a/gcc.spec b/gcc.spec index 05d7e9b..7ad9ac7 100644 --- a/gcc.spec +++ b/gcc.spec @@ -390,7 +390,6 @@ Patch3017: 0020-more-fixes.patch Patch3018: 0021-libstdc++-disable-tests.patch Patch3019: 0022-libstdc++-revert-behavior.patch Patch3020: gcc12-testsuite-typo.patch -Patch5000: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch Patch5001: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch Patch5002: 0026-Enable-small-loop-unrolling-for-O2.patch Patch5003: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch @@ -804,7 +803,6 @@ cd .. %patch3019 -p1 -b .dts-test-19~ %endif %patch3020 -p1 -b .typo -%patch5000 -p1 -b .dts-test-20~ %patch5001 -p1 -b .dts-test-22~ %patch5002 -p1 -b .dts-test-23~ %patch5003 -p1 -b .dts-test-24~ @@ -3003,7 +3001,6 @@ fi %changelog * Wed Jul 17 2024 Haochen Jiang 12.2.1-7.6.0.1 -- i386: Add syscall to enable AMX for latest kernels - Add attribute hot judgement for INLINE_HINT_known_hot hint - Enable small loop unrolling for O2 - i386: Only enable small loop unrolling in backend [PR 107692] -- Gitee