From adaddbf8157592c47565bd43579ee58e5cbb3e62 Mon Sep 17 00:00:00 2001 From: anolis-bot Date: Tue, 31 Jan 2023 14:38:37 +0800 Subject: [PATCH] update to gcc-toolset-12-gcc-12.1.1-3.4.el8_7 Signed-off-by: anolis-bot --- ...all-to-enable-AMX-for-latest-kernels.patch | 77 --- ...t-judgement-for-INLINE_HINT_known_ho.patch | 123 ----- 0026-Enable-small-loop-unrolling-for-O2.patch | 481 ------------------ ...-small-loop-unrolling-in-backend-PR-.patch | 231 --------- ...hain-for-march-alderlake-and-sapphir.patch | 35 -- dist | 2 +- gcc.spec | 29 +- ...patch => gcc12-detect-sapphirerapids.patch | 43 +- 8 files changed, 26 insertions(+), 995 deletions(-) delete mode 100644 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch delete mode 100644 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch delete mode 100644 0026-Enable-small-loop-unrolling-for-O2.patch delete mode 100644 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch delete mode 100644 0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch rename 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch => gcc12-detect-sapphirerapids.patch (74%) diff --git a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch b/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch deleted file mode 100644 index 94625b5..0000000 --- a/0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 5e377d21f1f345d8b157b9bc306e02bb9bd45e01 Mon Sep 17 00:00:00 2001 -From: Haochen Jiang -Date: Thu, 16 Jun 2022 00:15:53 -0700 -Subject: [PATCH] i386: Add syscall to enable AMX for latest kernels - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/amx-check.h (request_perm_xtile_data): - New function to check if AMX is usable and enable AMX. - (main): Run test if AMX is usable. ---- - gcc/testsuite/gcc.target/i386/amx-check.h | 30 +++++++++++++++++++++++ - 1 file changed, 30 insertions(+) - -diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h -index 434b0e59703..6fff5ff4631 100644 ---- a/gcc/testsuite/gcc.target/i386/amx-check.h -+++ b/gcc/testsuite/gcc.target/i386/amx-check.h -@@ -4,11 +4,24 @@ - #include - #include - #include -+#include -+#ifdef __linux__ -+#include -+#endif - #ifdef DEBUG - #include - #endif - #include "cpuid.h" - -+#define XFEATURE_XTILECFG 17 -+#define XFEATURE_XTILEDATA 18 -+#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) -+#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) -+#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) -+ -+#define ARCH_GET_XCOMP_PERM 0x1022 -+#define ARCH_REQ_XCOMP_PERM 0x1023 -+ - /* TODO: The tmm emulation is temporary for current - AMX implementation with no tmm regclass, should - be changed in the future. */ -@@ -44,6 +57,20 @@ typedef struct __tile - /* Stride (colum width in byte) used for tileload/store */ - #define _STRIDE 64 - -+#ifdef __linux__ -+/* We need syscall to use amx functions */ -+int request_perm_xtile_data() -+{ -+ unsigned long bitmask; -+ -+ if (syscall (SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) || -+ syscall (SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask)) -+ return 0; -+ -+ return (bitmask & XFEATURE_MASK_XTILE) != 0; -+} -+#endif -+ - /* Initialize tile config by setting all tmm size to 16x64 */ - void init_tile_config (__tilecfg_u *dst) - { -@@ -185,6 +212,9 @@ main () - #endif - #ifdef AMX_BF16 - && __builtin_cpu_supports ("amx-bf16") -+#endif -+#ifdef __linux__ -+ && request_perm_xtile_data () - #endif - ) - { --- -2.18.2 - diff --git a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch deleted file mode 100644 index 3e70f0c..0000000 --- a/0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 1b9a5cc9ec08e9f239dd2096edcc447b7a72f64a Mon Sep 17 00:00:00 2001 -From: "Cui,Lili" -Date: Tue, 1 Nov 2022 09:16:49 +0800 -Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot hint. - -We set up INLINE_HINT_known_hot hint only when we have profile feedback, -now add function attribute judgement for it, when both caller and callee -have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint -for it. - -With this patch applied, -ADL Multi-copy: 538.imagic_r 16.7% -ICX Multi-copy: 538.imagic_r 15.2% -CLX Multi-copy: 538.imagic_r 12.7% -Znver3 Multi-copy: 538.imagic_r 10.6% -Arm Multi-copy: 538.imagic_r 13.4% - -gcc/ChangeLog - - * ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute - judgement for INLINE_HINT_known_hot hint. - -gcc/testsuite/ChangeLog: - - * gcc.dg/ipa/inlinehint-6.c: New test. ---- - gcc/ipa-inline-analysis.cc | 13 ++++--- - gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++ - 2 files changed, 56 insertions(+), 4 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c - -diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc -index 1ca685d1b0e..7bd29c36590 100644 ---- a/gcc/ipa-inline-analysis.cc -+++ b/gcc/ipa-inline-analysis.cc -@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see - #include "ipa-utils.h" - #include "cfgexpand.h" - #include "gimplify.h" -+#include "attribs.h" - - /* Cached node/edge growths. */ - fast_call_summary *edge_growth_cache = NULL; -@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time) - hints = estimates.hints; - } - -- /* When we have profile feedback, we can quite safely identify hot -- edges and for those we disable size limits. Don't do that when -- probability that caller will call the callee is low however, since it -+ /* When we have profile feedback or function attribute, we can quite safely -+ identify hot edges and for those we disable size limits. Don't do that -+ when probability that caller will call the callee is low however, since it - may hurt optimization of the caller's hot path. */ -- if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p () -+ if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p () - && (edge->count.ipa ().apply_scale (2, 1) - > (edge->caller->inlined_to - ? edge->caller->inlined_to->count.ipa () - : edge->caller->count.ipa ()))) -+ || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl)) -+ != NULL -+ && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl)) -+ != NULL)) - hints |= INLINE_HINT_known_hot; - - gcc_checking_assert (size >= 0); -diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c -new file mode 100644 -index 00000000000..1f3be641c6d ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c -@@ -0,0 +1,47 @@ -+/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp" } */ -+/* { dg-add-options bind_pic_locally } */ -+ -+#define size_t long long int -+ -+struct A -+{ -+ size_t f1, f2, f3, f4; -+}; -+struct C -+{ -+ struct A a; -+ size_t b; -+}; -+struct C x; -+ -+__attribute__((hot)) struct C callee (struct A *a, struct C *c) -+{ -+ c->a=(*a); -+ -+ if((c->b + 7) & 17) -+ { -+ c->a.f1 = c->a.f2 + c->a.f1; -+ c->a.f2 = c->a.f3 - c->a.f2; -+ c->a.f3 = c->a.f2 + c->a.f3; -+ c->a.f4 = c->a.f2 - c->a.f4; -+ c->b = c->a.f2; -+ -+ } -+ return *c; -+} -+ -+__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c) -+{ -+ struct A a; -+ a.f1 = 1 + d; -+ a.f2 = e; -+ a.f3 = 12 + f; -+ a.f4 = 68 + g; -+ if (c->b > 0) -+ return callee (&a, c); -+ else -+ return *c; -+} -+ -+/* { dg-final { scan-ipa-dump "known_hot" "inline" } } */ -+ --- -2.18.2 - diff --git a/0026-Enable-small-loop-unrolling-for-O2.patch b/0026-Enable-small-loop-unrolling-for-O2.patch deleted file mode 100644 index b16171b..0000000 --- a/0026-Enable-small-loop-unrolling-for-O2.patch +++ /dev/null @@ -1,481 +0,0 @@ -From 6c977a4e458eab0dd7684b143baf72240b96fda8 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang -Date: Thu, 8 Sep 2022 16:52:02 +0800 -Subject: [PATCH 4/5] Enable small loop unrolling for O2 - -Modern processors has multiple way instruction decoders -For x86, icelake/zen3 has 5 uops, so for small loop with <= 4 -instructions (usually has 3 uops with a cmp/jmp pair that can be -macro-fused), the decoder would have 2 uops bubble for each iteration -and the pipeline could not be fully utilized. - -Therefore, this patch enables loop unrolling for small size loop at O2 -to fullfill the decoder as much as possible. It turns on rtl loop -unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only. -In x86 backend the default behavior is to unroll small loops with less -than 4 insns by 1 time. - -This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with -0.9% codesize increment. For other benchmarks the variants are minor -and overall codesize increased by 0.2%. - -The kernel image size increased by 0.06%, and no impact on eembc. - -gcc/ChangeLog: - - * common/config/i386/i386-common.cc (ix86_optimization_table): - Enable small loop unroll at O2 by default. - * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll - factor if -munroll-only-small-loops enabled and -funroll-loops/ - -funroll-all-loops are disabled. - * config/i386/i386.h (struct processor_costs): Add 2 field - small_unroll_ninsns and small_unroll_factor. - * config/i386/i386.opt: Add -munroll-only-small-loops. - * doc/invoke.texi: Document -munroll-only-small-loops. - * loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl - loop unrolling for -O2-speed and above if target hook - loop_unroll_adjust exists. - (pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag - when target hook loop_unroll_adjust exists. - * config/i386/x86-tune-costs.h: Update all processor costs - with small_unroll_ninsns = 4 and small_unroll_factor = 2. - -gcc/testsuite/ChangeLog: - - * gcc.dg/guality/loop-1.c: Add additional option - -mno-unroll-only-small-loops. - * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops. - * gcc.target/i386/pr93002.c: Likewise. ---- - gcc/common/config/i386/i386-common.cc | 1 + - gcc/config/i386/i386.cc | 18 ++++++++ - gcc/config/i386/i386.h | 5 +++ - gcc/config/i386/i386.opt | 4 ++ - gcc/config/i386/x86-tune-costs.h | 56 +++++++++++++++++++++++++ - gcc/doc/invoke.texi | 11 ++++- - gcc/loop-init.cc | 10 +++-- - gcc/testsuite/gcc.dg/guality/loop-1.c | 2 + - gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- - gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- - 10 files changed, 105 insertions(+), 6 deletions(-) - -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index 07fdd045f30..e1c1fb07d8a 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] = - /* The STC algorithm produces the smallest code at -Os, for x86. */ - { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, - REORDER_BLOCKS_ALGORITHM_STC }, -+ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, - /* Turn off -fschedule-insns by default. It tends to make the - problem with not enough registers even worse. */ - { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index b16df5b183e..39b2468799c 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -23561,6 +23561,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) - unsigned i; - unsigned mem_count = 0; - -+ /* Unroll small size loop when unroll factor is not explicitly -+ specified. */ -+ if (!(flag_unroll_loops -+ || flag_unroll_all_loops -+ || loop->unroll)) -+ { -+ nunroll = 1; -+ -+ /* Any explicit -f{no-}unroll-{all-}loops turns off -+ -munroll-only-small-loops. */ -+ if (ix86_unroll_only_small_loops -+ && !OPTION_SET_P (flag_unroll_loops) -+ && loop->ninsns <= ix86_cost->small_unroll_ninsns) -+ nunroll = ix86_cost->small_unroll_factor; -+ -+ return nunroll; -+ } -+ - if (!TARGET_ADJUST_UNROLL) - return nunroll; - -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index a61c32b8957..421801111a7 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -219,6 +219,11 @@ struct processor_costs { - const char *const align_jump; /* Jump alignment. */ - const char *const align_label; /* Label alignment. */ - const char *const align_func; /* Function alignment. */ -+ -+ const unsigned small_unroll_ninsns; /* Insn count limit for small loop -+ to be unrolled. */ -+ const unsigned small_unroll_factor; /* Unroll factor for small loop to -+ be unrolled. */ - }; - - extern const struct processor_costs *ix86_cost; -diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt -index a6b0e28f238..3d369647bf7 100644 ---- a/gcc/config/i386/i386.opt -+++ b/gcc/config/i386/i386.opt -@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols. - -param=x86-stlf-window-ninsns= - Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param - Instructions number above which STFL stall penalty can be compensated. -+ -+munroll-only-small-loops -+Target Var(ix86_unroll_only_small_loops) Init(0) Save -+Enable conservative small loop unrolling. -diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h -index 017ffa69958..b4303e4e971 100644 ---- a/gcc/config/i386/x86-tune-costs.h -+++ b/gcc/config/i386/x86-tune-costs.h -@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* Processor costs (relative to an add) */ -@@ -244,6 +246,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */ - "4", /* Jump alignment. */ - NULL, /* Label alignment. */ - "4", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs i486_memcpy[2] = { -@@ -354,6 +358,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */ - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs pentium_memcpy[2] = { -@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static const -@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes -@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs geode_memcpy[2] = { -@@ -786,6 +798,8 @@ struct processor_costs geode_cost = { - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs k6_memcpy[2] = { -@@ -896,6 +910,8 @@ struct processor_costs k6_cost = { - "32:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "32", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* For some reason, Athlon deals better with REP prefix (relative to loops) -@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* K8 has optimized REP instruction for medium sized blocks, but for very -@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for -@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = { - "32:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "32", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* BDVER has optimized REP instruction for medium sized blocks, but for -@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "11", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - -@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = { - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* ZNVER2 has optimized REP instruction for medium sized blocks, but for -@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = { - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - struct processor_costs znver3_cost = { -@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = { - "16", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ -@@ -1942,6 +1972,8 @@ struct processor_costs skylake_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* icelake_cost should produce code tuned for Icelake family of CPUs. -@@ -2068,6 +2100,8 @@ struct processor_costs icelake_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* alderlake_cost should produce code tuned for alderlake family of CPUs. */ -@@ -2188,6 +2222,8 @@ struct processor_costs alderlake_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* BTVER1 has optimized REP instruction for medium sized blocks, but for -@@ -2301,6 +2337,8 @@ const struct processor_costs btver1_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "11", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs btver2_memcpy[2] = { -@@ -2411,6 +2449,8 @@ const struct processor_costs btver2_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "11", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs pentium4_memcpy[2] = { -@@ -2520,6 +2560,8 @@ struct processor_costs pentium4_cost = { - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs nocona_memcpy[2] = { -@@ -2632,6 +2674,8 @@ struct processor_costs nocona_cost = { - NULL, /* Jump alignment. */ - NULL, /* Label alignment. */ - NULL, /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs atom_memcpy[2] = { -@@ -2742,6 +2786,8 @@ struct processor_costs atom_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs slm_memcpy[2] = { -@@ -2852,6 +2898,8 @@ struct processor_costs slm_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs tremont_memcpy[2] = { -@@ -2976,6 +3024,8 @@ struct processor_costs tremont_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - static stringop_algs intel_memcpy[2] = { -@@ -3086,6 +3136,8 @@ struct processor_costs intel_cost = { - "16:8:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* Generic should produce code tuned for Core-i7 (and newer chips) -@@ -3205,6 +3257,8 @@ struct processor_costs generic_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - - /* core_cost should produce code tuned for Core familly of CPUs. */ -@@ -3331,5 +3385,7 @@ struct processor_costs core_cost = { - "16:11:8", /* Jump alignment. */ - "0:0:8", /* Label alignment. */ - "16", /* Func alignment. */ -+ 4, /* Small unroll limit. */ -+ 2, /* Small unroll factor. */ - }; - -diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi -index 9ac7f89ebb1..1961cafa2bb 100644 ---- a/gcc/doc/invoke.texi -+++ b/gcc/doc/invoke.texi -@@ -1448,7 +1448,8 @@ See RS/6000 and PowerPC Options. - -mgeneral-regs-only -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol - -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol - -mindirect-branch-register -mharden-sls=@var{choice} @gol ---mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access} -+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol -+-munroll-only-small-loops} - - @emph{x86 Windows Options} - @gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol -@@ -33157,6 +33158,14 @@ treat access to protected symbols as local symbols. The default is - @option{-mno-direct-extern-access} and executable compiled with - @option{-mdirect-extern-access} may not be binary compatible if - protected symbols are used in shared libraries and executable. -+ -+@item -munroll-only-small-loops -+@opindex munroll-only-small-loops -+@opindex mno-unroll-only-small-loops -+Controls conservative small loop unrolling. It is default enabled by -+O2, and unrolls loop with less than 4 insns by 1 time. Explicit -+-f[no-]unroll-[all-]loops would disable this flag to avoid any -+unintended unrolling behavior that user does not want. - @end table - - @node x86 Windows Options -diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc -index 1e4f6cfd7fb..84336865ef7 100644 ---- a/gcc/loop-init.cc -+++ b/gcc/loop-init.cc -@@ -565,9 +565,12 @@ public: - {} - - /* opt_pass methods: */ -- virtual bool gate (function *) -+ virtual bool gate (function * fun) - { -- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll); -+ return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll -+ || (targetm.loop_unroll_adjust -+ && optimize >= 2 -+ && optimize_function_for_speed_p (fun))); - } - - virtual unsigned int execute (function *); -@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun) - if (dump_file) - df_dump (dump_file); - -- if (flag_unroll_loops) -+ if (flag_unroll_loops -+ || targetm.loop_unroll_adjust) - flags |= UAP_UNROLL; - if (flag_unroll_all_loops) - flags |= UAP_UNROLL_ALL; -diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c -index 1b1f6d32271..a32ea445a3f 100644 ---- a/gcc/testsuite/gcc.dg/guality/loop-1.c -+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c -@@ -1,5 +1,7 @@ - /* { dg-do run } */ - /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ -+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ -+ - - #include "../nop.h" - -diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c -index 81841ef5bd7..cbc9fbb0450 100644 ---- a/gcc/testsuite/gcc.target/i386/pr86270.c -+++ b/gcc/testsuite/gcc.target/i386/pr86270.c -@@ -1,5 +1,5 @@ - /* { dg-do compile } */ --/* { dg-options "-O2" } */ -+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ - - int *a; - long len; -diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c -index 0248fcc00a5..f75a847f75d 100644 ---- a/gcc/testsuite/gcc.target/i386/pr93002.c -+++ b/gcc/testsuite/gcc.target/i386/pr93002.c -@@ -1,6 +1,6 @@ - /* PR target/93002 */ - /* { dg-do compile } */ --/* { dg-options "-O2" } */ -+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ - /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ - - volatile int sink; --- -2.18.2 - diff --git a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch deleted file mode 100644 index de3995f..0000000 --- a/0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch +++ /dev/null @@ -1,231 +0,0 @@ -From 5c07825ca0c34dd946a8cfc0325ddb452d7f65c5 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang -Date: Sat, 19 Nov 2022 09:38:00 +0800 -Subject: [PATCH 5/5] i386: Only enable small loop unrolling in backend [PR - 107692] - -Followed by the discussion in pr107692, -munroll-only-small-loops -Does not turns on/off -funroll-loops, and current check in -pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take -effect. Revert the change about targetm.loop_unroll_adjust and apply -the backend option change to strictly follow the rule that --funroll-loops takes full control of loop unrolling, and -munroll-only-small-loops just change its behavior to unroll small size -loops. - -gcc/ChangeLog: - - PR target/107692 - * common/config/i386/i386-common.cc (ix86_optimization_table): - Enable loop unroll O2, disable -fweb and -frename-registers - by default. - * config/i386/i386-options.cc - (ix86_override_options_after_change): - Disable small loop unroll when funroll-loops enabled, reset - cunroll_grow_size when it is not explicitly enabled. - (ix86_option_override_internal): Call - ix86_override_options_after_change instead of calling - ix86_recompute_optlev_based_flags and ix86_default_align - separately. - * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll - factor if -munroll-only-small-loops enabled. - * loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable - loop unrolling for -O2-speed. - (pass_rtl_unroll_loops::execute): Rmove - targetm.loop_unroll_adjust check. - -gcc/testsuite/ChangeLog: - - PR target/107692 - * gcc.dg/guality/loop-1.c: Remove additional option for ia32. - * gcc.target/i386/pr86270.c: Add -fno-unroll-loops. - * gcc.target/i386/pr93002.c: Likewise. ---- - gcc/common/config/i386/i386-common.cc | 8 ++++++ - gcc/config/i386/i386-options.cc | 34 ++++++++++++++++++++++--- - gcc/config/i386/i386.cc | 18 ++++--------- - gcc/loop-init.cc | 11 +++----- - gcc/testsuite/gcc.dg/guality/loop-1.c | 2 -- - gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- - gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- - 7 files changed, 49 insertions(+), 28 deletions(-) - -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index e1c1fb07d8a..5e777849f91 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] = - /* The STC algorithm produces the smallest code at -Os, for x86. */ - { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, - REORDER_BLOCKS_ALGORITHM_STC }, -+ -+ /* Turn on -funroll-loops with -munroll-only-small-loops to enable small -+ loop unrolling at -O2. */ -+ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 }, - { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, -+ /* Turns off -frename-registers and -fweb which are enabled by -+ funroll-loops. */ -+ { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 }, -+ { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 }, - /* Turn off -fschedule-insns by default. It tends to make the - problem with not enough registers even worse. */ - { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index 32cc58a764b..b853ff55825 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts, - void - ix86_override_options_after_change (void) - { -+ /* Default align_* from the processor table. */ - ix86_default_align (&global_options); -+ - ix86_recompute_optlev_based_flags (&global_options, &global_options_set); -+ -+ /* Disable unrolling small loops when there's explicit -+ -f{,no}unroll-loop. */ -+ if ((OPTION_SET_P (flag_unroll_loops)) -+ || (OPTION_SET_P (flag_unroll_all_loops) -+ && flag_unroll_all_loops)) -+ { -+ if (!OPTION_SET_P (ix86_unroll_only_small_loops)) -+ ix86_unroll_only_small_loops = 0; -+ /* Re-enable -frename-registers and -fweb if funroll-loops -+ enabled. */ -+ if (!OPTION_SET_P (flag_web)) -+ flag_web = flag_unroll_loops; -+ if (!OPTION_SET_P (flag_rename_registers)) -+ flag_rename_registers = flag_unroll_loops; -+ /* -fcunroll-grow-size default follws -f[no]-unroll-loops. */ -+ if (!OPTION_SET_P (flag_cunroll_grow_size)) -+ flag_cunroll_grow_size = flag_unroll_loops -+ || flag_peel_loops -+ || optimize >= 3; -+ } -+ else -+ { -+ if (!OPTION_SET_P (flag_cunroll_grow_size)) -+ flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; -+ } -+ - } - - /* Clear stack slot assignments remembered from previous functions. -@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p, - - set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes); - -- ix86_recompute_optlev_based_flags (opts, opts_set); -+ ix86_override_options_after_change (); - - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality -@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p, - || TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_regparm = REGPARM_MAX; - -- /* Default align_* from the processor table. */ -- ix86_default_align (opts); -- - /* Provide default for -mbranch-cost= value. */ - SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost, - ix86_tune_cost->branch_cost); -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index 39b2468799c..000415c0e2e 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -23563,20 +23563,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) - - /* Unroll small size loop when unroll factor is not explicitly - specified. */ -- if (!(flag_unroll_loops -- || flag_unroll_all_loops -- || loop->unroll)) -+ if (ix86_unroll_only_small_loops && !loop->unroll) - { -- nunroll = 1; -- -- /* Any explicit -f{no-}unroll-{all-}loops turns off -- -munroll-only-small-loops. */ -- if (ix86_unroll_only_small_loops -- && !OPTION_SET_P (flag_unroll_loops) -- && loop->ninsns <= ix86_cost->small_unroll_ninsns) -- nunroll = ix86_cost->small_unroll_factor; -- -- return nunroll; -+ if (loop->ninsns <= ix86_cost->small_unroll_ninsns) -+ return MIN (nunroll, ix86_cost->small_unroll_factor); -+ else -+ return 1; - } - - if (!TARGET_ADJUST_UNROLL) -diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc -index 84336865ef7..ed1b2f6ebab 100644 ---- a/gcc/loop-init.cc -+++ b/gcc/loop-init.cc -@@ -565,12 +565,10 @@ public: - {} - - /* opt_pass methods: */ -- virtual bool gate (function * fun) -+ virtual bool gate (function *) - { -- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll -- || (targetm.loop_unroll_adjust -- && optimize >= 2 -- && optimize_function_for_speed_p (fun))); -+ return (flag_unroll_loops || flag_unroll_all_loops -+ || cfun->has_unroll); - } - - virtual unsigned int execute (function *); -@@ -586,8 +584,7 @@ pass_rtl_unroll_loops::execute (function *fun) - if (dump_file) - df_dump (dump_file); - -- if (flag_unroll_loops -- || targetm.loop_unroll_adjust) -+ if (flag_unroll_loops) - flags |= UAP_UNROLL; - if (flag_unroll_all_loops) - flags |= UAP_UNROLL_ALL; -diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c -index a32ea445a3f..1b1f6d32271 100644 ---- a/gcc/testsuite/gcc.dg/guality/loop-1.c -+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c -@@ -1,7 +1,5 @@ - /* { dg-do run } */ - /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ --/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ -- - - #include "../nop.h" - -diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c -index cbc9fbb0450..98b012caf23 100644 ---- a/gcc/testsuite/gcc.target/i386/pr86270.c -+++ b/gcc/testsuite/gcc.target/i386/pr86270.c -@@ -1,5 +1,5 @@ - /* { dg-do compile } */ --/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ -+/* { dg-options "-O2 -fno-unroll-loops" } */ - - int *a; - long len; -diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c -index f75a847f75d..7e2d869e17b 100644 ---- a/gcc/testsuite/gcc.target/i386/pr93002.c -+++ b/gcc/testsuite/gcc.target/i386/pr93002.c -@@ -1,6 +1,6 @@ - /* PR target/93002 */ - /* { dg-do compile } */ --/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ -+/* { dg-options "-O2 -fno-unroll-loops" } */ - /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ - - volatile int sink; --- -2.18.2 - diff --git a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch b/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch deleted file mode 100644 index ad65965..0000000 --- a/0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch +++ /dev/null @@ -1,35 +0,0 @@ -From b7980cd8d8bcf41b3ca1b6f3ba147789d42a9b99 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang -Date: Tue, 6 Dec 2022 09:53:35 +0800 -Subject: [PATCH] i386: Avoid fma_chain for -march=alderlake and - sapphirerapids. - -For Alderlake there is similar issue like PR 81616, enable -avoid_fma256_chain will also benefit on Intel latest platforms -Alderlake and Sapphire Rapids. - -gcc/ChangeLog: - - * config/i386/x86-tune.def (X86_TUNE_AVOID_256FMA_CHAINS): Add - m_SAPPHIRERAPIDS, m_ALDERLAKE. ---- - gcc/config/i386/x86-tune.def | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def -index d983e2f6213..1e1b206a71c 100644 ---- a/gcc/config/i386/x86-tune.def -+++ b/gcc/config/i386/x86-tune.def -@@ -485,7 +485,8 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER) - - /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or - smaller FMA chain. */ --DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3) -+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 -+ | m_ALDERLAKE | m_SAPPHIRERAPIDS) - - /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd - for v2df vector reduction. */ --- -2.18.2 - diff --git a/dist b/dist index 9c0e36e..535c690 100644 --- a/dist +++ b/dist @@ -1 +1 @@ -an8 +an8_7 diff --git a/gcc.spec b/gcc.spec index 95a3e44..b2fb923 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,4 +1,3 @@ -%define anolis_release .0.2 %global __python /usr/bin/python3 %{?scl:%global __strip %%{_scl_root}/usr/bin/strip} %{?scl:%global __objdump %%{_scl_root}/usr/bin/objdump} @@ -148,7 +147,7 @@ Summary: GCC version 12 Name: %{?scl_prefix}gcc Version: %{gcc_version} -Release: %{gcc_release}.2%{anolis_release}%{?dist} +Release: %{gcc_release}.4%{?dist} # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have # GCC Runtime Exception. License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD @@ -353,6 +352,8 @@ Patch12: gcc12-pr105551.patch Patch13: gcc12-libtsan-s390x.patch # This has been backported to GCC 12, so eventually we can drop it. Patch14: gcc12-pr105991.patch +# For DTS 12.0.z. +Patch15: gcc12-detect-sapphirerapids.patch Patch100: gcc12-fortran-fdec-duplicates.patch Patch101: gcc12-fortran-flogical-as-integer.patch @@ -387,12 +388,6 @@ Patch3016: 0019-xfails.patch Patch3017: 0020-more-fixes.patch Patch3018: 0021-libstdc++-disable-tests.patch Patch3019: 0022-libstdc++-revert-behavior.patch -Patch3020: 0023-i386-Add-syscall-to-enable-AMX-for-latest-kernels.patch -Patch3021: 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch -Patch3022: 0025-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch -Patch3023: 0026-Enable-small-loop-unrolling-for-O2.patch -Patch3024: 0027-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch -Patch3025: 0028-i386-Avoid-fma_chain-for-march-alderlake-and-sapphir.patch %if 0%{?rhel} == 9 %global nonsharedver 110 @@ -737,6 +732,7 @@ so that there cannot be any synchronization problems. %patch12 -p0 -b .pr105551~ %patch13 -p0 -b .libtsan-s390x~ %patch14 -p1 -b .pr105991~ +%patch15 -p1 -b .detect-spr~ %if 0%{?rhel} >= 6 %patch100 -p1 -b .fortran-fdec-duplicates~ @@ -798,12 +794,6 @@ cd .. %if 0%{?rhel} <= 7 %patch3019 -p1 -b .dts-test-19~ %endif -%patch3020 -p1 -b .dts-test-20~ -%patch3021 -p1 -b .dts-test-21~ -%patch3022 -p1 -b .dts-test-22~ -%patch3023 -p1 -b .dts-test-23~ -%patch3024 -p1 -b .dts-test-24~ -%patch3025 -p1 -b .dts-test-25~ find gcc/testsuite -name \*.pr96939~ | xargs rm -f @@ -2970,15 +2960,8 @@ fi %endif %changelog -* Tue Dec 27 2022 Hongyu Wang 12.1.1-3.2.0.2 -- i386: Avoid fma_chain for -march=alderlake and sapphirerapids - -* Thu Dec 22 2022 Haochen Jiang 12.1.1-3.2.0.1 -- i386: Add syscall to enable AMX for latest kernels -- Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS -- Add attribute hot judgement for INLINE_HINT_known_hot hint -- Enable small loop unrolling for O2 -- i386: Only enable small loop unrolling in backend [PR 107692] +* Thu Dec 1 2022 Marek Polacek 12.1.1-3.4 +- fix Sapphire Rapids detection in host_detect_local_cpu (#2150131) * Fri Jul 8 2022 Marek Polacek 12.1.1-3.2 - recognize PLUS and XOR forms of rldimi (PR target/105991, #2095789) diff --git a/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch b/gcc12-detect-sapphirerapids.patch similarity index 74% rename from 0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch rename to gcc12-detect-sapphirerapids.patch index 42cabc2..5b994e6 100644 --- a/0024-Remove-AVX512_VP2INTERSECT-from-PTA_SAPPHIRERAPIDS.patch +++ b/gcc12-detect-sapphirerapids.patch @@ -1,25 +1,23 @@ -From 11c72f20d4d7ba1862a257cef05dc3a5e84a276d Mon Sep 17 00:00:00 2001 -From: "Cui,Lili" -Date: Thu, 29 Sep 2022 14:28:06 +0800 -Subject: [PATCH] Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS +commit d644dfe36d9733c767af62d37250253ced6efd8c +Author: Cui,Lili +Date: Mon Nov 7 11:25:41 2022 +0800 -gcc/ChangeLog: - - * config/i386/driver-i386.cc (host_detect_local_cpu): - Move sapphirerapids out of AVX512_VP2INTERSECT. - * config/i386/i386.h: Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS - * doc/invoke.texi: Remove AVX512_VP2INTERSECT from SAPPHIRERAPIDS ---- - gcc/config/i386/driver-i386.cc | 13 +++++-------- - gcc/config/i386/i386.h | 7 +++---- - gcc/doc/invoke.texi | 8 ++++---- - 3 files changed, 12 insertions(+), 16 deletions(-) + Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS + + gcc/ChangeLog: + + * config/i386/driver-i386.cc (host_detect_local_cpu): + Move sapphirerapids out of AVX512_VP2INTERSECT. + * config/i386/i386.h: Remove AVX512_VP2INTERSECT from PTA_SAPPHIRERAPIDS + * doc/invoke.texi: Remove AVX512_VP2INTERSECT from SAPPHIRERAPIDS + + (cherry picked from commit d644dfe36d9733c767af62d37250253ced6efd8c) diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc -index 3c702fdca33..ef567045c67 100644 +index 9e0ae0b2baa..fcf23fd921d 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc -@@ -589,15 +589,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) +@@ -574,15 +574,12 @@ const char *host_detect_local_cpu (int argc, const char **argv) /* This is unknown family 0x6 CPU. */ if (has_feature (FEATURE_AVX)) { @@ -41,10 +39,10 @@ index 3c702fdca33..ef567045c67 100644 else if (has_feature (FEATURE_AVX512BF16)) cpu = "cooperlake"; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index 900a3bc3673..372a2cff8fe 100644 +index 363082ba47b..a61c32b8957 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h -@@ -2326,10 +2326,9 @@ constexpr wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT +@@ -2328,10 +2328,9 @@ constexpr wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT constexpr wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_CLWB | PTA_AVX512VP2INTERSECT | PTA_KL | PTA_WIDEKL; constexpr wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_ICELAKE_SERVER | PTA_MOVDIRI @@ -59,10 +57,10 @@ index 900a3bc3673..372a2cff8fe 100644 | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD | PTA_PREFETCHWT1; constexpr wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE; diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi -index 271c8bb8468..a9ecc4426a4 100644 +index 3749e06f13e..cee057a70bf 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi -@@ -32057,11 +32057,11 @@ Intel sapphirerapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +@@ -31541,11 +31541,11 @@ Intel sapphirerapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, @@ -78,6 +76,3 @@ index 271c8bb8468..a9ecc4426a4 100644 @item alderlake Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, --- -2.18.2 - -- Gitee