diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 3235ba1e5b06d77f4f40962ef44888e62d30988c..f18e19630d9595784cdabc686470c3f3fe5a35a8 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -220,9 +220,22 @@ static inline long arch_scale_freq_capacity(int cpu)
 }
 #define arch_scale_freq_capacity arch_scale_freq_capacity
 
+bool arch_enable_hybrid_capacity_scale(void);
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+			   unsigned long cap_freq, unsigned long base_freq);
+
+unsigned long arch_scale_cpu_capacity(int cpu);
+#define arch_scale_cpu_capacity arch_scale_cpu_capacity
+
 extern void arch_set_max_freq_ratio(bool turbo_disabled);
 extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
 #else
+static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
+					 unsigned long max_cap,
+					 unsigned long cap_freq,
+					 unsigned long base_freq) { }
+
 static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
 static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
 #endif
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index fdbb5f07448fad0dd267868fdfb0a6394d9e45b9..5dd4cc7836a7b2f34a6e240782b27ab3923c7282 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -347,9 +347,89 @@ static DECLARE_WORK(disable_freq_invariance_work,
 
 DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+	unsigned long capacity;
+	unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+	int cpu;
+
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+		return true;
+	}
+
+	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+	if (!arch_cpu_scale)
+		return false;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+	}
+
+	static_branch_enable(&arch_hybrid_cap_scale_key);
+
+	pr_info("Hybrid CPU capacity scaling enabled\n");
+
+	return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
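As an editorial aside (not part of the patch): a minimal sketch of how this interface is meant to be driven. The real caller is the intel_pstate change later in this series; the CPU numbers, capacities and frequencies below are invented for illustration.

	/* Hypothetical platform code; values are illustrative only. */
	static void example_enable_hybrid_capacity(void)
	{
		int cpu;

		/* Allocate the per-CPU data and flip the static key. */
		if (!arch_enable_hybrid_capacity_scale())
			return;

		for_each_online_cpu(cpu) {
			/*
			 * Example: a core with 3/4 of the biggest core's
			 * capacity, reached at 3000 MHz, while MPERF counts
			 * at a 2000 MHz base clock.
			 */
			arch_set_cpu_capacity(cpu, 768, 1024, 3000, 2000);
		}

		/* The step the kernel-doc above asks for. */
		arch_rebuild_sched_domains();
	}

With those values, arch_set_cpu_capacity() stores capacity = (768 << SCHED_CAPACITY_SHIFT) / 1024 = 768 and freq_ratio = (3000 << SCHED_CAPACITY_SHIFT) / 2000 = 1536, and scale_freq_tick() below then yields freq_scale = (APERF/MPERF) * (base_freq/cap_freq) * SCHED_CAPACITY_SCALE, i.e. the current frequency relative to this CPU's own capacity frequency; running at 1500 MHz gives (1500/2000) * (2000/3000) * 1024 = 512.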
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+			   unsigned long cap_freq, unsigned long base_freq)
+{
+	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+	} else {
+		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+	}
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+	return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
 static void scale_freq_tick(u64 acnt, u64 mcnt)
 {
-	u64 freq_scale;
+	u64 freq_scale, freq_ratio;
 
 	if (!arch_scale_freq_invariant())
 		return;
@@ -357,7 +437,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
 	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
 		goto error;
 
-	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+	else
+		freq_ratio = arch_max_freq_ratio;
+
+	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
 		goto error;
 
 	freq_scale = div64_u64(acnt, mcnt);
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 15916b2abaab25c7de53a5bc28d39cc21b5eaceb..6dc835316085a0d4c43f1681877e6736924238d1 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -129,6 +129,20 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr);
 #define CPC_SUPPORTED(cpc) ((cpc)->type == ACPI_TYPE_INTEGER ?		\
 				!!(cpc)->cpc_entry.int_value :		\
 				!IS_NULL_REG(&(cpc)->cpc_entry.reg))
+
+/*
+ * Each bit indicates the optionality of the register in per-cpu
+ * cpc_regs[] with the corresponding index. 0 means mandatory and 1
+ * means optional.
+ */
+#define REG_OPTIONAL (0x1FC7D0)
+
+/*
+ * Use the index of the register in per-cpu cpc_regs[] to check if
+ * it's an optional one.
+ */
+#define IS_OPTIONAL_CPC_REG(reg_idx) (REG_OPTIONAL & (1U << (reg_idx)))
+
 /*
  * Arbitrary Retries in case the remote processor is slow to respond
  * to PCC commands.
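A short decoding note for REG_OPTIONAL above (editorial aside; the indices assumed here are those of enum cppc_regs in include/acpi/cppc_acpi.h, e.g. DESIRED_PERF = 5 and ENERGY_PERF = 17): bit N of 0x1FC7D0 is set when the register at index N of cpc_regs[] is optional per the ACPI CPPC definition, so for example:

	IS_OPTIONAL_CPC_REG(ENERGY_PERF)	/* bit 17 set in 0x1FC7D0   -> optional  */
	IS_OPTIONAL_CPC_REG(DESIRED_PERF)	/* bit 5 clear in 0x1FC7D0  -> mandatory */

cppc_get_reg_val() below uses this so that an optional integer-type register reported as 0 is treated as unsupported (-EOPNOTSUPP), while a mandatory register with a zero value is still read normally.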
Keeping it high enough to cover emulators where @@ -1178,32 +1192,44 @@ static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val) return ret_val; } -static int cppc_get_perf(int cpunum, enum cppc_regs reg_idx, u64 *perf) +static int cppc_get_reg_val(int cpu, enum cppc_regs reg_idx, u64 *val) { - struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); struct cpc_register_resource *reg; + if (val == NULL) + return -EINVAL; + if (!cpc_desc) { - pr_debug("No CPC descriptor for CPU:%d\n", cpunum); + pr_debug("No CPC descriptor for CPU:%d\n", cpu); return -ENODEV; } reg = &cpc_desc->cpc_regs[reg_idx]; + if ((reg->type == ACPI_TYPE_INTEGER && IS_OPTIONAL_CPC_REG(reg_idx) && + !reg->cpc_entry.int_value) || (reg->type != ACPI_TYPE_INTEGER && + IS_NULL_REG(®->cpc_entry.reg))) { + pr_debug("CPC register is not supported\n"); + return -EOPNOTSUPP; + } + if (CPC_IN_PCC(reg)) { - int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cppc_pcc_data *pcc_ss_data = NULL; - int ret = 0; + int ret; - if (pcc_ss_id < 0) - return -EIO; + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id\n"); + return -ENODEV; + } pcc_ss_data = pcc_data[pcc_ss_id]; down_write(&pcc_ss_data->pcc_lock); if (send_pcc_cmd(pcc_ss_id, CMD_READ) >= 0) - cpc_read(cpunum, reg, perf); + ret = cpc_read(cpu, reg, val); else ret = -EIO; @@ -1212,9 +1238,7 @@ static int cppc_get_perf(int cpunum, enum cppc_regs reg_idx, u64 *perf) return ret; } - cpc_read(cpunum, reg, perf); - - return 0; + return cpc_read(cpu, reg, val); } /** @@ -1226,7 +1250,7 @@ static int cppc_get_perf(int cpunum, enum cppc_regs reg_idx, u64 *perf) */ int cppc_get_desired_perf(int cpunum, u64 *desired_perf) { - return cppc_get_perf(cpunum, DESIRED_PERF, desired_perf); + return cppc_get_reg_val(cpunum, DESIRED_PERF, desired_perf); } EXPORT_SYMBOL_GPL(cppc_get_desired_perf); @@ -1239,7 +1263,7 @@ EXPORT_SYMBOL_GPL(cppc_get_desired_perf); */ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) { - return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); + return cppc_get_reg_val(cpunum, NOMINAL_PERF, nominal_perf); } /** @@ -1251,7 +1275,7 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) */ int cppc_get_highest_perf(int cpunum, u64 *highest_perf) { - return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); + return cppc_get_reg_val(cpunum, HIGHEST_PERF, highest_perf); } EXPORT_SYMBOL_GPL(cppc_get_highest_perf); @@ -1264,7 +1288,7 @@ EXPORT_SYMBOL_GPL(cppc_get_highest_perf); */ int cppc_get_epp_perf(int cpunum, u64 *epp_perf) { - return cppc_get_perf(cpunum, ENERGY_PERF, epp_perf); + return cppc_get_reg_val(cpunum, ENERGY_PERF, epp_perf); } EXPORT_SYMBOL_GPL(cppc_get_epp_perf); diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index f457d4af2c62e5e7828946773fae7fcd0d15e657..32e1bdc588c52d33f57faaa2ce39d37c22621f89 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -90,7 +90,8 @@ TRACE_EVENT(amd_pstate_epp_perf, u8 epp, u8 min_perf, u8 max_perf, - bool boost + bool boost, + bool changed ), TP_ARGS(cpu_id, @@ -98,7 +99,8 @@ TRACE_EVENT(amd_pstate_epp_perf, epp, min_perf, max_perf, - boost), + boost, + changed), TP_STRUCT__entry( __field(unsigned int, cpu_id) @@ -107,6 +109,7 @@ TRACE_EVENT(amd_pstate_epp_perf, __field(u8, min_perf) __field(u8, max_perf) __field(bool, boost) + __field(bool, changed) ), TP_fast_assign( @@ -116,15 +119,17 @@ 
TRACE_EVENT(amd_pstate_epp_perf, __entry->min_perf = min_perf; __entry->max_perf = max_perf; __entry->boost = boost; + __entry->changed = changed; ), - TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u", + TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u, changed=%u", (unsigned int)__entry->cpu_id, (u8)__entry->min_perf, (u8)__entry->max_perf, (u8)__entry->highest_perf, (u8)__entry->epp, - (bool)__entry->boost + (bool)__entry->boost, + (bool)__entry->changed ) ); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 30411c3a22bf5dbf2e057cfbcc80a600a2cf0ecd..1693479019f35587ee2dfbde25c8c2525b6f820b 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -85,7 +85,6 @@ static struct cpufreq_driver *current_pstate_driver; static struct cpufreq_driver amd_pstate_driver; static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; -static bool cppc_enabled; static bool amd_pstate_prefcore = true; static struct quirk_entry *quirks; @@ -228,9 +227,10 @@ static u8 shmem_get_epp(struct amd_cpudata *cpudata) return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, epp); } -static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, +static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { + struct amd_cpudata *cpudata = policy->driver_data; u64 value, prev; value = prev = READ_ONCE(cpudata->cppc_req_cached); @@ -242,6 +242,18 @@ static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = READ_ONCE(cpudata->perf); + + trace_amd_pstate_epp_perf(cpudata->cpu, + perf.highest_perf, + epp, + min_perf, + max_perf, + policy->boost_enabled, + value != prev); + } + if (value == prev) return 0; @@ -256,24 +268,24 @@ static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, } WRITE_ONCE(cpudata->cppc_req_cached, value); - WRITE_ONCE(cpudata->epp_cached, epp); return 0; } DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); -static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, +static inline int amd_pstate_update_perf(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { - return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, + return static_call(amd_pstate_update_perf)(policy, min_perf, des_perf, max_perf, epp, fast_switch); } -static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) +static int msr_set_epp(struct cpufreq_policy *policy, u8 epp) { + struct amd_cpudata *cpudata = policy->driver_data; u64 value, prev; int ret; @@ -281,6 +293,19 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) value &= ~AMD_CPPC_EPP_PERF_MASK; value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = cpudata->perf; + + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, + epp, + FIELD_GET(AMD_CPPC_MIN_PERF_MASK, + cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, + cpudata->cppc_req_cached), + policy->boost_enabled, + value != prev); + } + if (value == prev) return 0; @@ -291,7 +316,6 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) } /* update both so that msr_update_perf() can effectively check */ - WRITE_ONCE(cpudata->epp_cached, epp); WRITE_ONCE(cpudata->cppc_req_cached, 
value); return ret; @@ -299,17 +323,35 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); -static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u8 epp) +static inline int amd_pstate_set_epp(struct cpufreq_policy *policy, u8 epp) { - return static_call(amd_pstate_set_epp)(cpudata, epp); + return static_call(amd_pstate_set_epp)(policy, epp); } -static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp) +static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) { - int ret; + struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; + u8 epp_cached; + u64 value; + int ret; + - if (epp == cpudata->epp_cached) + epp_cached = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = cpudata->perf; + + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, + epp, + FIELD_GET(AMD_CPPC_MIN_PERF_MASK, + cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, + cpudata->cppc_req_cached), + policy->boost_enabled, + epp != epp_cached); + } + + if (epp == epp_cached) return 0; perf_ctrls.energy_perf = epp; @@ -318,110 +360,37 @@ static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp) pr_debug("failed to set energy perf value (%d)\n", ret); return ret; } - WRITE_ONCE(cpudata->epp_cached, epp); + + value = READ_ONCE(cpudata->cppc_req_cached); + value &= ~AMD_CPPC_EPP_PERF_MASK; + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + WRITE_ONCE(cpudata->cppc_req_cached, value); return ret; } -static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, - int pref_index) +static inline int msr_cppc_enable(struct cpufreq_policy *policy) { - struct amd_cpudata *cpudata = policy->driver_data; - u8 epp; - - if (!pref_index) - epp = cpudata->epp_default; - else - epp = epp_values[pref_index]; - - if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { - pr_debug("EPP cannot be set under performance policy\n"); - return -EBUSY; - } - - if (trace_amd_pstate_epp_perf_enabled()) { - union perf_cached perf = READ_ONCE(cpudata->perf); - - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, - epp, - FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), - FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), - policy->boost_enabled); - } - - return amd_pstate_set_epp(cpudata, epp); + return wrmsrl_safe_on_cpu(policy->cpu, MSR_AMD_CPPC_ENABLE, 1); } -static inline int msr_cppc_enable(bool enable) +static int shmem_cppc_enable(struct cpufreq_policy *policy) { - int ret, cpu; - unsigned long logical_proc_id_mask = 0; - - /* - * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared. 
- */ - if (!enable) - return 0; - - if (enable == cppc_enabled) - return 0; - - for_each_present_cpu(cpu) { - unsigned long logical_id = topology_logical_package_id(cpu); - - if (test_bit(logical_id, &logical_proc_id_mask)) - continue; - - set_bit(logical_id, &logical_proc_id_mask); - - ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE, - enable); - if (ret) - return ret; - } - - cppc_enabled = enable; - return 0; -} - -static int shmem_cppc_enable(bool enable) -{ - int cpu, ret = 0; - struct cppc_perf_ctrls perf_ctrls; - - if (enable == cppc_enabled) - return 0; - - for_each_present_cpu(cpu) { - ret = cppc_set_enable(cpu, enable); - if (ret) - return ret; - - /* Enable autonomous mode for EPP */ - if (cppc_state == AMD_PSTATE_ACTIVE) { - /* Set desired perf as zero to allow EPP firmware control */ - perf_ctrls.desired_perf = 0; - ret = cppc_set_perf(cpu, &perf_ctrls); - if (ret) - return ret; - } - } - - cppc_enabled = enable; - return ret; + return cppc_set_enable(policy->cpu, 1); } DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable); -static inline int amd_pstate_cppc_enable(bool enable) +static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy) { - return static_call(amd_pstate_cppc_enable)(enable); + return static_call(amd_pstate_cppc_enable)(policy); } static int msr_init_perf(struct amd_cpudata *cpudata) { union perf_cached perf = READ_ONCE(cpudata->perf); - u64 cap1, numerator; + u64 cap1, numerator, cppc_req; + u8 min_perf; int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &cap1); @@ -432,6 +401,22 @@ static int msr_init_perf(struct amd_cpudata *cpudata) if (ret) return ret; + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req); + if (ret) + return ret; + + WRITE_ONCE(cpudata->cppc_req_cached, cppc_req); + min_perf = FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cppc_req); + + /* + * Clear out the min_perf part to check if the rest of the MSR is 0, if yes, this is an + * indication that the min_perf value is the one specified through the BIOS option + */ + cppc_req &= ~(AMD_CPPC_MIN_PERF_MASK); + + if (!cppc_req) + perf.bios_min_perf = min_perf; + perf.highest_perf = numerator; perf.max_limit_perf = numerator; perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); @@ -492,23 +477,56 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) return static_call(amd_pstate_init_perf)(cpudata); } -static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, +static int shmem_update_perf(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { + struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; + u64 value, prev; + int ret; if (cppc_state == AMD_PSTATE_ACTIVE) { - int ret = shmem_set_epp(cpudata, epp); + int ret = shmem_set_epp(policy, epp); if (ret) return ret; } + value = prev = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | + AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); + value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); + value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); + value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = READ_ONCE(cpudata->perf); + + trace_amd_pstate_epp_perf(cpudata->cpu, + perf.highest_perf, + epp, + min_perf, + max_perf, + policy->boost_enabled, + value != prev); + } + + if (value == prev) + return 0; + perf_ctrls.max_perf 
= max_perf; perf_ctrls.min_perf = min_perf; perf_ctrls.desired_perf = des_perf; - return cppc_set_perf(cpudata->cpu, &perf_ctrls); + ret = cppc_set_perf(cpudata->cpu, &perf_ctrls); + if (ret) + return ret; + + WRITE_ONCE(cpudata->cppc_req_cached, value); + + return 0; } static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) @@ -553,6 +571,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, if (!policy) return; + /* limit the max perf when core performance boost feature is disabled */ + if (!cpudata->boost_supported) + max_perf = min_t(u8, perf.nominal_perf, max_perf); + des_perf = clamp_t(u8, des_perf, min_perf, max_perf); policy->cur = perf_to_freq(perf, cpudata->nominal_freq, des_perf); @@ -562,41 +584,42 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, des_perf = 0; } - /* limit the max perf when core performance boost feature is disabled */ - if (!cpudata->boost_supported) - max_perf = min_t(u8, perf.nominal_perf, max_perf); - if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc, cpudata->cpu, fast_switch); } - amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); + amd_pstate_update_perf(policy, min_perf, des_perf, max_perf, 0, fast_switch); } static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) { /* * Initialize lower frequency limit (i.e.policy->min) with - * lowest_nonlinear_frequency which is the most energy efficient - * frequency. Override the initial value set by cpufreq core and - * amd-pstate qos_requests. + * lowest_nonlinear_frequency or the min frequency (if) specified in BIOS, + * Override the initial value set by cpufreq core and amd-pstate qos_requests. 
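To make the BIOS-min handling described above concrete (an editorial aside with an invented value): msr_init_perf() earlier in this file caches MSR_AMD_CPPC_REQ and, only when every field other than min_perf is zero, treats that min_perf as the "Requested CPU Min Frequency" BIOS setting. So if the MSR reads back as FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, 0x18) with all other fields clear, perf.bios_min_perf becomes 0x18 and the default lower limit computed here is perf_to_freq(perf, cpudata->nominal_freq, 0x18); if the firmware has already programmed max/desired/EPP fields as well, bios_min_perf stays 0 and the lowest_nonlinear_freq default applies.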
*/ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(policy_data->cpu); struct amd_cpudata *cpudata; + union perf_cached perf; if (!policy) return -EINVAL; cpudata = policy->driver_data; - policy_data->min = cpudata->lowest_nonlinear_freq; + perf = READ_ONCE(cpudata->perf); + + if (perf.bios_min_perf) + policy_data->min = perf_to_freq(perf, cpudata->nominal_freq, + perf.bios_min_perf); + else + policy_data->min = cpudata->lowest_nonlinear_freq; } cpufreq_verify_within_cpu_limits(policy_data); - pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); return 0; } @@ -607,13 +630,16 @@ static void amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) union perf_cached perf = READ_ONCE(cpudata->perf); perf.max_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->max); - perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min); + WRITE_ONCE(cpudata->max_limit_freq, policy->max); - if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { perf.min_limit_perf = min(perf.nominal_perf, perf.max_limit_perf); + WRITE_ONCE(cpudata->min_limit_freq, min(cpudata->nominal_freq, cpudata->max_limit_freq)); + } else { + perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min); + WRITE_ONCE(cpudata->min_limit_freq, policy->min); + } - WRITE_ONCE(cpudata->max_limit_freq, policy->max); - WRITE_ONCE(cpudata->min_limit_freq, policy->min); WRITE_ONCE(cpudata->perf, perf); } @@ -791,16 +817,6 @@ static void amd_perf_ctl_reset(unsigned int cpu) wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); } -/* - * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks - * due to locking, so queue the work for later. - */ -static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) -{ - sched_set_itmt_support(); -} -static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); - #define CPPC_MAX_PERF U8_MAX static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) @@ -811,29 +827,20 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) cpudata->hw_prefcore = true; - /* - * The priorities can be set regardless of whether or not - * sched_set_itmt_support(true) has been called and it is valid to - * update them at any time after it has been called. - */ + /* Priorities must be initialized before ITMT support can be toggled on. 
*/ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->prefcore_ranking), cpudata->cpu); - - schedule_work(&sched_prefcore_work); } -static void amd_pstate_update_limits(unsigned int cpu) +static void amd_pstate_update_limits(struct cpufreq_policy *policy) { - struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; u32 prev_high = 0, cur_high = 0; bool highest_perf_changed = false; + unsigned int cpu = policy->cpu; if (!amd_pstate_prefcore) return; - if (!policy) - return; - if (amd_get_highest_perf(cpu, &cur_high)) return; @@ -844,8 +851,10 @@ static void amd_pstate_update_limits(unsigned int cpu) if (highest_perf_changed) { WRITE_ONCE(cpudata->prefcore_ranking, cur_high); - if (cur_high < CPPC_MAX_PERF) + if (cur_high < CPPC_MAX_PERF) { sched_set_itmt_core_prio((int)cur_high, cpu); + sched_update_asym_prefer_cpu(cpu, prev_high, cur_high); + } } } @@ -994,6 +1003,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) perf.highest_perf); policy->boost_supported = READ_ONCE(cpudata->boost_supported); + ret = amd_pstate_cppc_enable(policy); + if (ret) + goto free_cpudata1; + /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; @@ -1033,6 +1046,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + + /* Reset CPPC_REQ MSR to the BIOS value */ + amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); freq_qos_remove_request(&cpudata->req[1]); freq_qos_remove_request(&cpudata->req[0]); @@ -1040,28 +1057,6 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) kfree(cpudata); } -static int amd_pstate_cpu_resume(struct cpufreq_policy *policy) -{ - int ret; - - ret = amd_pstate_cppc_enable(true); - if (ret) - pr_err("failed to enable amd-pstate during resume, return %d\n", ret); - - return ret; -} - -static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) -{ - int ret; - - ret = amd_pstate_cppc_enable(false); - if (ret) - pr_err("failed to disable amd-pstate during suspend, return %d\n", ret); - - return ret; -} - /* Sysfs attributes */ /* @@ -1153,8 +1148,10 @@ static ssize_t show_energy_performance_available_preferences( static ssize_t store_energy_performance_preference( struct cpufreq_policy *policy, const char *buf, size_t count) { + struct amd_cpudata *cpudata = policy->driver_data; char str_preference[21]; ssize_t ret; + u8 epp; ret = sscanf(buf, "%20s", str_preference); if (ret != 1) @@ -1164,7 +1161,17 @@ static ssize_t store_energy_performance_preference( if (ret < 0) return -EINVAL; - ret = amd_pstate_set_energy_pref_index(policy, ret); + if (!ret) + epp = cpudata->epp_default; + else + epp = epp_values[ret]; + + if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + pr_debug("EPP cannot be set under performance policy\n"); + return -EBUSY; + } + + ret = amd_pstate_set_epp(policy, epp); return ret ? 
ret : count; } @@ -1173,9 +1180,11 @@ static ssize_t show_energy_performance_preference( struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; - u8 preference; + u8 preference, epp; - switch (cpudata->epp_cached) { + epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + + switch (epp) { case AMD_CPPC_EPP_PERFORMANCE: preference = EPP_INDEX_PERFORMANCE; break; @@ -1197,7 +1206,9 @@ static ssize_t show_energy_performance_preference( static void amd_pstate_driver_cleanup(void) { - amd_pstate_cppc_enable(false); + if (amd_pstate_prefcore) + sched_clear_itmt_support(); + cppc_state = AMD_PSTATE_DISABLE; current_pstate_driver = NULL; } @@ -1231,14 +1242,6 @@ static int amd_pstate_register_driver(int mode) cppc_state = mode; - ret = amd_pstate_cppc_enable(true); - if (ret) { - pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", - ret); - amd_pstate_driver_cleanup(); - return ret; - } - /* at least one CPU supports CPB */ current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); @@ -1248,6 +1251,10 @@ static int amd_pstate_register_driver(int mode) return ret; } + /* Enable ITMT support once all CPUs have initialized their asym priorities. */ + if (amd_pstate_prefcore) + sched_set_itmt_support(); + return 0; } @@ -1438,7 +1445,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) struct amd_cpudata *cpudata; union perf_cached perf; struct device *dev; - u64 value; int ret; /* @@ -1478,11 +1484,15 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf); + policy->driver_data = cpudata; + + ret = amd_pstate_cppc_enable(policy); + if (ret) + goto free_cpudata1; /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; - policy->driver_data = cpudata; policy->boost_supported = READ_ONCE(cpudata->boost_supported); @@ -1499,13 +1509,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; } - if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); - if (ret) - return ret; - WRITE_ONCE(cpudata->cppc_req_cached, value); - } - ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); + ret = amd_pstate_set_epp(policy, cpudata->epp_default); if (ret) return ret; @@ -1524,6 +1528,11 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) struct amd_cpudata *cpudata = policy->driver_data; if (cpudata) { + union perf_cached perf = READ_ONCE(cpudata->perf); + + /* Reset CPPC_REQ MSR to the BIOS value */ + amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + kfree(cpudata); policy->driver_data = NULL; } @@ -1543,17 +1552,11 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) epp = 0; else - epp = READ_ONCE(cpudata->epp_cached); + epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); perf = READ_ONCE(cpudata->perf); - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp, - perf.min_limit_perf, - perf.max_limit_perf, - policy->boost_enabled); - } - return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U, + return amd_pstate_update_perf(policy, perf.min_limit_perf, 0U, perf.max_limit_perf, epp, false); } @@ -1565,9 +1568,6 @@ static int 
amd_pstate_epp_set_policy(struct cpufreq_policy *policy) if (!policy->cpuinfo.max_freq) return -ENODEV; - pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", - policy->cpuinfo.max_freq, policy->max); - cpudata->policy = policy->policy; ret = amd_pstate_epp_update_limit(policy); @@ -1583,68 +1583,38 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) return 0; } -static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) +static int amd_pstate_cpu_online(struct cpufreq_policy *policy) { - struct amd_cpudata *cpudata = policy->driver_data; - union perf_cached perf = READ_ONCE(cpudata->perf); - int ret; - - ret = amd_pstate_cppc_enable(true); - if (ret) - pr_err("failed to enable amd pstate during resume, return %d\n", ret); - - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, - cpudata->epp_cached, - FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), - perf.highest_perf, policy->boost_enabled); - } - - return amd_pstate_epp_update_limit(policy); + return amd_pstate_cppc_enable(policy); } -static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) -{ - struct amd_cpudata *cpudata = policy->driver_data; - int ret; - - pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); - - ret = amd_pstate_epp_reenable(policy); - if (ret) - return ret; - cpudata->suspended = false; - - return 0; -} - -static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +static int amd_pstate_cpu_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf = READ_ONCE(cpudata->perf); - if (cpudata->suspended) - return 0; - - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, - AMD_CPPC_EPP_BALANCE_POWERSAVE, - perf.lowest_perf, perf.lowest_perf, - policy->boost_enabled); - } - - return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf, - AMD_CPPC_EPP_BALANCE_POWERSAVE, false); + /* + * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified + * min_perf value across kexec reboots. If this CPU is just onlined normally after this, the + * limits, epp and desired perf will get reset to the cached values in cpudata struct + */ + return amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); } -static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) +static int amd_pstate_suspend(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); int ret; - /* avoid suspending when EPP is not enabled */ - if (cppc_state != AMD_PSTATE_ACTIVE) - return 0; + /* + * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified + * min_perf value across kexec reboots. 
If this CPU is just resumed back without kexec, + * the limits, epp and desired perf will get reset to the cached values in cpudata struct + */ + ret = amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + if (ret) + return ret; /* invalidate to ensure it's rewritten during resume */ cpudata->cppc_req_cached = 0; @@ -1652,21 +1622,31 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) /* set this flag to avoid setting core offline*/ cpudata->suspended = true; - /* disable CPPC in lowlevel firmware */ - ret = amd_pstate_cppc_enable(false); - if (ret) - pr_err("failed to suspend, return %d\n", ret); - return 0; } +static int amd_pstate_resume(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); + int cur_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->cur); + + /* Set CPPC_REQ to last sane value until the governor updates it */ + return amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, + 0U, false); +} + static int amd_pstate_epp_resume(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->suspended) { + int ret; + /* enable amd pstate from suspend state*/ - amd_pstate_epp_reenable(policy); + ret = amd_pstate_epp_update_limit(policy); + if (ret) + return ret; cpudata->suspended = false; } @@ -1681,8 +1661,10 @@ static struct cpufreq_driver amd_pstate_driver = { .fast_switch = amd_pstate_fast_switch, .init = amd_pstate_cpu_init, .exit = amd_pstate_cpu_exit, - .suspend = amd_pstate_cpu_suspend, - .resume = amd_pstate_cpu_resume, + .online = amd_pstate_cpu_online, + .offline = amd_pstate_cpu_offline, + .suspend = amd_pstate_suspend, + .resume = amd_pstate_resume, .set_boost = amd_pstate_set_boost, .update_limits = amd_pstate_update_limits, .name = "amd-pstate", @@ -1695,9 +1677,9 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .setpolicy = amd_pstate_epp_set_policy, .init = amd_pstate_epp_cpu_init, .exit = amd_pstate_epp_cpu_exit, - .offline = amd_pstate_epp_cpu_offline, - .online = amd_pstate_epp_cpu_online, - .suspend = amd_pstate_epp_suspend, + .offline = amd_pstate_cpu_offline, + .online = amd_pstate_cpu_online, + .suspend = amd_pstate_suspend, .resume = amd_pstate_epp_resume, .update_limits = amd_pstate_update_limits, .set_boost = amd_pstate_set_boost, @@ -1849,7 +1831,6 @@ static int __init amd_pstate_init(void) global_attr_free: cpufreq_unregister_driver(current_pstate_driver); - amd_pstate_cppc_enable(false); return ret; } device_initcall(amd_pstate_init); diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 1557e1afea79cdf6d2db8d77c133b0f871568d6d..2f7ae364d3313393c2ca853f911f0b9aa26a5a09 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -30,6 +30,7 @@ * @lowest_perf: the absolute lowest performance level of the processor * @min_limit_perf: Cached value of the performance corresponding to policy->min * @max_limit_perf: Cached value of the performance corresponding to policy->max + * @bios_min_perf: Cached perf value corresponding to the "Requested CPU Min Frequency" BIOS option */ union perf_cached { struct { @@ -39,6 +40,7 @@ union perf_cached { u8 lowest_perf; u8 min_limit_perf; u8 max_limit_perf; + u8 bios_min_perf; }; u64 val; }; @@ -102,7 +104,6 @@ struct amd_cpudata { bool hw_prefcore; /* EPP feature related attributes*/ - u8 epp_cached; u32 policy; bool suspended; u8 epp_default; diff --git 
a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index cc98d8cf54330348e173203e4be902757edaa255..c6d681ae2f974a72c365d0212eebf3a50618a44c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1010,17 +1010,16 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); - ssize_t ret = -EBUSY; if (!fattr->show) return -EIO; - down_read(&policy->rwsem); + guard(cpufreq_policy_read)(policy); + if (likely(!policy_is_inactive(policy))) - ret = fattr->show(policy, buf); - up_read(&policy->rwsem); + return fattr->show(policy, buf); - return ret; + return -EBUSY; } static ssize_t store(struct kobject *kobj, struct attribute *attr, @@ -1028,17 +1027,16 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); - ssize_t ret = -EBUSY; if (!fattr->store) return -EIO; - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); + if (likely(!policy_is_inactive(policy))) - ret = fattr->store(policy, buf, count); - up_write(&policy->rwsem); + return fattr->store(policy, buf, count); - return ret; + return -EBUSY; } static void cpufreq_sysfs_release(struct kobject *kobj) @@ -1175,7 +1173,8 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp if (cpumask_test_cpu(cpu, policy->cpus)) return 0; - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); + if (has_target()) cpufreq_stop_governor(policy); @@ -1186,7 +1185,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp if (ret) pr_err("%s: Failed to start governor\n", __func__); } - up_write(&policy->rwsem); + return ret; } @@ -1206,9 +1205,10 @@ static void handle_update(struct work_struct *work) container_of(work, struct cpufreq_policy, update); pr_debug("handle_update for cpu %u called\n", policy->cpu); - down_write(&policy->rwsem); + + guard(cpufreq_policy_write)(policy); + refresh_frequency_limits(policy); - up_write(&policy->rwsem); } static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq, @@ -1234,11 +1234,11 @@ static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy) struct kobject *kobj; struct completion *cmp; - down_write(&policy->rwsem); - cpufreq_stats_free_table(policy); - kobj = &policy->kobj; - cmp = &policy->kobj_unregister; - up_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + cpufreq_stats_free_table(policy); + kobj = &policy->kobj; + cmp = &policy->kobj_unregister; + } kobject_put(kobj); /* @@ -1315,7 +1315,6 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) init_waitqueue_head(&policy->transition_wait); INIT_WORK(&policy->update, handle_update); - policy->cpu = cpu; return policy; err_min_qos_notifier: @@ -1384,35 +1383,17 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) kfree(policy); } -static int cpufreq_online(unsigned int cpu) +static int cpufreq_policy_online(struct cpufreq_policy *policy, + unsigned int cpu, bool new_policy) { - struct cpufreq_policy *policy; - bool new_policy; unsigned long flags; unsigned int j; int ret; - pr_debug("%s: bringing CPU%u online\n", __func__, cpu); - - /* Check if this CPU already has a policy to manage it */ - policy = per_cpu(cpufreq_cpu_data, cpu); - if (policy) { - WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus)); - if (!policy_is_inactive(policy)) - return 
cpufreq_add_policy_cpu(policy, cpu); + guard(cpufreq_policy_write)(policy); - /* This is the only online CPU for the policy. Start over. */ - new_policy = false; - down_write(&policy->rwsem); - policy->cpu = cpu; - policy->governor = NULL; - } else { - new_policy = true; - policy = cpufreq_policy_alloc(cpu); - if (!policy) - return -ENOMEM; - down_write(&policy->rwsem); - } + policy->cpu = cpu; + policy->governor = NULL; if (!new_policy && cpufreq_driver->online) { /* Recover policy->cpus using related_cpus */ @@ -1435,7 +1416,7 @@ static int cpufreq_online(unsigned int cpu) if (ret) { pr_debug("%s: %d: initialization failed\n", __func__, __LINE__); - goto out_free_policy; + goto out_clear_policy; } /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ @@ -1586,20 +1567,6 @@ static int cpufreq_online(unsigned int cpu) goto out_destroy_policy; } - up_write(&policy->rwsem); - - kobject_uevent(&policy->kobj, KOBJ_ADD); - - /* Callback for handling stuff after policy is ready */ - if (cpufreq_driver->ready) - cpufreq_driver->ready(policy); - - /* Register cpufreq cooling only for a new policy */ - if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) - policy->cdev = of_cpufreq_cooling_register(policy); - - pr_debug("initialization complete\n"); - return 0; out_destroy_policy: @@ -1614,14 +1581,57 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_driver->exit) cpufreq_driver->exit(policy); -out_free_policy: +out_clear_policy: cpumask_clear(policy->cpus); - up_write(&policy->rwsem); - cpufreq_policy_free(policy); return ret; } +static int cpufreq_online(unsigned int cpu) +{ + struct cpufreq_policy *policy; + bool new_policy; + int ret; + + pr_debug("%s: bringing CPU%u online\n", __func__, cpu); + + /* Check if this CPU already has a policy to manage it */ + policy = per_cpu(cpufreq_cpu_data, cpu); + if (policy) { + WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus)); + if (!policy_is_inactive(policy)) + return cpufreq_add_policy_cpu(policy, cpu); + + /* This is the only online CPU for the policy. Start over. */ + new_policy = false; + } else { + new_policy = true; + policy = cpufreq_policy_alloc(cpu); + if (!policy) + return -ENOMEM; + } + + ret = cpufreq_policy_online(policy, cpu, new_policy); + if (ret) { + cpufreq_policy_free(policy); + return ret; + } + + kobject_uevent(&policy->kobj, KOBJ_ADD); + + /* Callback for handling stuff after policy is ready */ + if (cpufreq_driver->ready) + cpufreq_driver->ready(policy); + + /* Register cpufreq cooling only for a new policy */ + if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) + policy->cdev = of_cpufreq_cooling_register(policy); + + pr_debug("initialization complete\n"); + + return 0; +} + /** * cpufreq_add_dev - the cpufreq interface for a CPU device. * @dev: CPU device. 
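For readers unfamiliar with the guard conversion used throughout this file: the cpufreq_policy_read/cpufreq_policy_write guard classes are declared with DEFINE_GUARD() in include/linux/cpufreq.h later in this patch, on top of linux/cleanup.h. A minimal sketch of what the pattern does in practice (hypothetical function, not part of the patch):

	/* Illustration of the guard pattern used above. */
	static ssize_t example_store(struct cpufreq_policy *policy,
				     const char *buf, size_t count)
	{
		guard(cpufreq_policy_write)(policy);	/* down_write(&policy->rwsem) */

		if (policy_is_inactive(policy))
			return -EBUSY;		/* up_write() runs automatically here */

		return count;			/* ...and on this return path too */
	}

scoped_guard(cpufreq_policy_write, policy) { ... } does the same but confines the lock to the braced block, which is why cpufreq_remove_dev() above can keep cpufreq_policy_free() outside the locked region.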
@@ -1709,11 +1719,10 @@ static int cpufreq_offline(unsigned int cpu) return 0; } - down_write(&policy->rwsem); + guard(cpufreq_policy_write)(policy); __cpufreq_offline(cpu, policy); - up_write(&policy->rwsem); return 0; } @@ -1730,33 +1739,29 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) if (!policy) return; - down_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + if (cpu_online(cpu)) + __cpufreq_offline(cpu, policy); - if (cpu_online(cpu)) - __cpufreq_offline(cpu, policy); + remove_cpu_dev_symlink(policy, cpu, dev); - remove_cpu_dev_symlink(policy, cpu, dev); + if (!cpumask_empty(policy->real_cpus)) + return; - if (!cpumask_empty(policy->real_cpus)) { - up_write(&policy->rwsem); - return; - } + /* + * Unregister cpufreq cooling once all the CPUs of the policy + * are removed. + */ + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { + cpufreq_cooling_unregister(policy->cdev); + policy->cdev = NULL; + } - /* - * Unregister cpufreq cooling once all the CPUs of the policy are - * removed. - */ - if (cpufreq_thermal_control_enabled(cpufreq_driver)) { - cpufreq_cooling_unregister(policy->cdev); - policy->cdev = NULL; + /* We did light-weight exit earlier, do full tear down now */ + if (cpufreq_driver->offline && cpufreq_driver->exit) + cpufreq_driver->exit(policy); } - /* We did light-weight exit earlier, do full tear down now */ - if (cpufreq_driver->offline && cpufreq_driver->exit) - cpufreq_driver->exit(policy); - - up_write(&policy->rwsem); - cpufreq_policy_free(policy); } @@ -1909,15 +1914,16 @@ unsigned int cpufreq_get(unsigned int cpu) struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); unsigned int ret_freq = 0; - if (policy) { - down_read(&policy->rwsem); + if (!policy) + return 0; + + scoped_guard(cpufreq_policy_read, policy) { if (cpufreq_driver->get) ret_freq = __cpufreq_get(policy); - up_read(&policy->rwsem); - - cpufreq_cpu_put(policy); } + cpufreq_cpu_put(policy); + return ret_freq; } EXPORT_SYMBOL(cpufreq_get); @@ -1977,9 +1983,9 @@ void cpufreq_suspend(void) for_each_active_policy(policy) { if (has_target()) { - down_write(&policy->rwsem); - cpufreq_stop_governor(policy); - up_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + cpufreq_stop_governor(policy); + } } if (cpufreq_driver->suspend && cpufreq_driver->suspend(policy)) @@ -2020,9 +2026,9 @@ void cpufreq_resume(void) pr_err("%s: Failed to resume driver: %s\n", __func__, cpufreq_driver->name); } else if (has_target()) { - down_write(&policy->rwsem); - ret = cpufreq_start_governor(policy); - up_write(&policy->rwsem); + scoped_guard(cpufreq_policy_write, policy) { + ret = cpufreq_start_governor(policy); + } if (ret) pr_err("%s: Failed to start governor for CPU%u's policy\n", @@ -2390,15 +2396,9 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - int ret; - - down_write(&policy->rwsem); - - ret = __cpufreq_driver_target(policy, target_freq, relation); + guard(cpufreq_policy_write)(policy); - up_write(&policy->rwsem); - - return ret; + return __cpufreq_driver_target(policy, target_freq, relation); } EXPORT_SYMBOL_GPL(cpufreq_driver_target); @@ -2757,7 +2757,7 @@ void cpufreq_update_limits(unsigned int cpu) return; if (cpufreq_driver->update_limits) - cpufreq_driver->update_limits(cpu); + cpufreq_driver->update_limits(policy); else cpufreq_update_policy(cpu); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 
909359b717f341b3f0096369823b7a8532c1d366..e6888fb57afe32aa01777da403d8385a3f65016e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -215,6 +216,7 @@ struct global_params { * @hwp_req_cached: Cached value of the last HWP Request MSR * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR * @last_io_update: Last time when IO wake flag was set + * @capacity_perf: Highest perf used for scale invariance * @sched_flags: Store scheduler flags for possible cross CPU update * @hwp_boost_min: Last HWP boosted min performance * @suspended: Whether or not the driver has been suspended. @@ -253,6 +255,7 @@ struct cpudata { u64 hwp_req_cached; u64 hwp_cap_cached; u64 last_io_update; + unsigned int capacity_perf; unsigned int sched_flags; u32 hwp_boost_min; bool suspended; @@ -295,6 +298,7 @@ static int hwp_mode_bdw __ro_after_init; static bool per_cpu_limits __ro_after_init; static bool hwp_forced __ro_after_init; static bool hwp_boost __read_mostly; +static bool hwp_is_hybrid; static struct cpufreq_driver *intel_pstate_driver __read_mostly; @@ -930,6 +934,139 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { NULL, }; +static struct cpudata *hybrid_max_perf_cpu __read_mostly; +/* + * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata, + * and the x86 arch scale-invariance information from concurrent updates. + */ +static DEFINE_MUTEX(hybrid_capacity_lock); + +static void hybrid_set_cpu_capacity(struct cpudata *cpu) +{ + arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf, + hybrid_max_perf_cpu->capacity_perf, + cpu->capacity_perf, + cpu->pstate.max_pstate_physical); + + pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu, + cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, + cpu->pstate.max_pstate_physical); +} + +static void hybrid_clear_cpu_capacity(unsigned int cpunum) +{ + arch_set_cpu_capacity(cpunum, 1, 1, 1, 1); +} + +static void hybrid_get_capacity_perf(struct cpudata *cpu) +{ + if (READ_ONCE(global.no_turbo)) { + cpu->capacity_perf = cpu->pstate.max_pstate_physical; + return; + } + + cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached)); +} + +static void hybrid_set_capacity_of_cpus(void) +{ + int cpunum; + + for_each_online_cpu(cpunum) { + struct cpudata *cpu = all_cpu_data[cpunum]; + + if (cpu) + hybrid_set_cpu_capacity(cpu); + } +} + +static void hybrid_update_cpu_capacity_scaling(void) +{ + struct cpudata *max_perf_cpu = NULL; + unsigned int max_cap_perf = 0; + int cpunum; + + for_each_online_cpu(cpunum) { + struct cpudata *cpu = all_cpu_data[cpunum]; + + if (!cpu) + continue; + + /* + * During initialization, CPU performance at full capacity needs + * to be determined. + */ + if (!hybrid_max_perf_cpu) + hybrid_get_capacity_perf(cpu); + + /* + * If hybrid_max_perf_cpu is not NULL at this point, it is + * being replaced, so don't take it into account when looking + * for the new one. + */ + if (cpu == hybrid_max_perf_cpu) + continue; + + if (cpu->capacity_perf > max_cap_perf) { + max_cap_perf = cpu->capacity_perf; + max_perf_cpu = cpu; + } + } + + if (max_perf_cpu) { + hybrid_max_perf_cpu = max_perf_cpu; + hybrid_set_capacity_of_cpus(); + } else { + pr_info("Found no CPUs with nonzero maximum performance\n"); + /* Revert to the flat CPU capacity structure. 
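A worked example of what hybrid_set_cpu_capacity() above feeds into the arch code (numbers invented for illustration): on a hybrid system where the most capable core's HWP highest performance is 60, an E-core with capacity_perf = 45 and max_pstate_physical = 24 results in arch_set_cpu_capacity(cpu, 45, 60, 45, 24), so arch_scale_cpu_capacity() reports 45 * 1024 / 60 = 768 for that CPU and its per-CPU freq_ratio becomes 45 * 1024 / 24 = 1920. When turbo is disabled, hybrid_get_capacity_perf() uses max_pstate_physical as the capacity basis instead of the HWP highest-performance value.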
*/ + for_each_online_cpu(cpunum) + hybrid_clear_cpu_capacity(cpunum); + } +} + +static void __hybrid_init_cpu_capacity_scaling(void) +{ + hybrid_max_perf_cpu = NULL; + hybrid_update_cpu_capacity_scaling(); +} + +static void hybrid_init_cpu_capacity_scaling(void) +{ + bool disable_itmt = false; + + mutex_lock(&hybrid_capacity_lock); + + /* + * If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity + * scaling has been enabled already and the driver is just changing the + * operation mode. + */ + if (hybrid_max_perf_cpu) { + __hybrid_init_cpu_capacity_scaling(); + goto unlock; + } + + /* + * On hybrid systems, use asym capacity instead of ITMT, but because + * the capacity of SMT threads is not deterministic even approximately, + * do not do that when SMT is in use. + */ + if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) { + __hybrid_init_cpu_capacity_scaling(); + disable_itmt = true; + } + +unlock: + mutex_unlock(&hybrid_capacity_lock); + + /* + * Disabling ITMT causes sched domains to be rebuilt to disable asym + * packing and enable asym capacity. + */ + if (disable_itmt) + sched_clear_itmt_support(); +} + static void __intel_pstate_get_hwp_cap(struct cpudata *cpu) { u64 cap; @@ -958,6 +1095,43 @@ static void intel_pstate_get_hwp_cap(struct cpudata *cpu) } } +static void hybrid_update_capacity(struct cpudata *cpu) +{ + unsigned int max_cap_perf; + + mutex_lock(&hybrid_capacity_lock); + + if (!hybrid_max_perf_cpu) + goto unlock; + + /* + * The maximum performance of the CPU may have changed, but assume + * that the performance of the other CPUs has not changed. + */ + max_cap_perf = hybrid_max_perf_cpu->capacity_perf; + + intel_pstate_get_hwp_cap(cpu); + + hybrid_get_capacity_perf(cpu); + /* Should hybrid_max_perf_cpu be replaced by this CPU? */ + if (cpu->capacity_perf > max_cap_perf) { + hybrid_max_perf_cpu = cpu; + hybrid_set_capacity_of_cpus(); + goto unlock; + } + + /* If this CPU is hybrid_max_perf_cpu, should it be replaced? */ + if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) { + hybrid_update_cpu_capacity_scaling(); + goto unlock; + } + + hybrid_set_cpu_capacity(cpu); + +unlock: + mutex_unlock(&hybrid_capacity_lock); +} + static void intel_pstate_hwp_set(unsigned int cpu) { struct cpudata *cpu_data = all_cpu_data[cpu]; @@ -1066,6 +1240,22 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu) value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); + + mutex_lock(&hybrid_capacity_lock); + + if (!hybrid_max_perf_cpu) { + mutex_unlock(&hybrid_capacity_lock); + + return; + } + + if (hybrid_max_perf_cpu == cpu) + hybrid_update_cpu_capacity_scaling(); + + mutex_unlock(&hybrid_capacity_lock); + + /* Reset the capacity of the CPU going offline to the initial value. 
*/ + hybrid_clear_cpu_capacity(cpu->cpu); } #define POWER_CTL_EE_ENABLE 1 @@ -1146,9 +1336,11 @@ static void intel_pstate_update_policies(void) cpufreq_update_policy(cpu); } -static void __intel_pstate_update_max_freq(struct cpudata *cpudata, - struct cpufreq_policy *policy) +static void __intel_pstate_update_max_freq(struct cpufreq_policy *policy, + struct cpudata *cpudata) { + guard(cpufreq_policy_write)(policy); + if (hwp_active) intel_pstate_get_hwp_cap(cpudata); @@ -1158,16 +1350,26 @@ static void __intel_pstate_update_max_freq(struct cpudata *cpudata, refresh_frequency_limits(policy); } -static void intel_pstate_update_limits(unsigned int cpu) +static bool intel_pstate_update_max_freq(struct cpudata *cpudata) { - struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); + struct cpufreq_policy *policy __free(put_cpufreq_policy); + policy = cpufreq_cpu_get(cpudata->cpu); if (!policy) - return; + return false; - __intel_pstate_update_max_freq(all_cpu_data[cpu], policy); + __intel_pstate_update_max_freq(policy, cpudata); - cpufreq_cpu_release(policy); + return true; +} + +static void intel_pstate_update_limits(struct cpufreq_policy *policy) +{ + struct cpudata *cpudata = all_cpu_data[policy->cpu]; + + __intel_pstate_update_max_freq(policy, cpudata); + + hybrid_update_capacity(cpudata); } static void intel_pstate_update_limits_for_all(void) @@ -1175,7 +1377,14 @@ static void intel_pstate_update_limits_for_all(void) int cpu; for_each_possible_cpu(cpu) - intel_pstate_update_limits(cpu); + intel_pstate_update_max_freq(all_cpu_data[cpu]); + + mutex_lock(&hybrid_capacity_lock); + + if (hybrid_max_perf_cpu) + __hybrid_init_cpu_capacity_scaling(); + + mutex_unlock(&hybrid_capacity_lock); } /************************** sysfs begin ************************/ @@ -1608,12 +1817,14 @@ static void intel_pstate_notify_work(struct work_struct *work) { struct cpudata *cpudata = container_of(to_delayed_work(work), struct cpudata, hwp_notify_work); - struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); - - if (policy) { - __intel_pstate_update_max_freq(cpudata, policy); - cpufreq_cpu_release(policy); + if (intel_pstate_update_max_freq(cpudata)) { + /* + * The driver will not be unregistered while this function is + * running, so update the capacity without acquiring the driver + * lock. 
+ */ + hybrid_update_capacity(cpudata); } wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0); @@ -2030,8 +2241,10 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_cpu_scaling) { cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu); - if (cpu->pstate.scaling != perf_ctl_scaling) + if (cpu->pstate.scaling != perf_ctl_scaling) { intel_pstate_hybrid_hwp_adjust(cpu); + hwp_is_hybrid = true; + } } else { cpu->pstate.scaling = perf_ctl_scaling; } @@ -2707,6 +2920,8 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy) */ intel_pstate_hwp_reenable(cpu); cpu->suspended = false; + + hybrid_update_capacity(cpu); } return 0; @@ -3147,6 +3362,8 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver) global.min_perf_pct = min_perf_pct_min(); + hybrid_init_cpu_capacity_scaling(); + return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 202b3185d923e46d5acb47c31c9773203cda43e2..080ddc3187e1abc85fdf108aba1b3fbff35d0ae9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -170,6 +170,12 @@ struct cpufreq_policy { struct notifier_block nb_max; }; +DEFINE_GUARD(cpufreq_policy_write, struct cpufreq_policy *, + down_write(&_T->rwsem), up_write(&_T->rwsem)) + +DEFINE_GUARD(cpufreq_policy_read, struct cpufreq_policy *, + down_read(&_T->rwsem), up_read(&_T->rwsem)) + /* * Used for passing new cpufreq policy data to the cpufreq driver's ->verify() * callback for sanitization. That callback is only expected to modify the min @@ -384,7 +390,7 @@ struct cpufreq_driver { unsigned int (*get)(unsigned int cpu); /* Called to update policy limits on firmware notifications. */ - void (*update_limits)(unsigned int cpu); + void (*update_limits)(struct cpufreq_policy *policy); /* optional */ int (*bios_limit)(int cpu, unsigned int *limit); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 336d8bbd5c17020719abe499ebeea863a00c6b0d..7ce562c7fd64990316699ea76b9855e3145da2e5 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -216,6 +216,8 @@ struct sched_domain_topology_level { }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); +extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); + #ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type @@ -249,6 +251,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) return true; } +static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c2bd6e3e78078291d31d30e7669f067ac1043167..3ee0ec5ce03d2f6122a65b228106a86b7d7af7dd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1327,6 +1327,64 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) update_group_capacity(sd, cpu); } +#ifdef CONFIG_SMP + +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. 
+		 *
+		 * If you are hitting this warning, consider moving
+		 * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
+		 * which is shared by all the overlapping groups.
+		 */
+		WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+
+		sg = sd->groups;
+		if (cpu != sg->asym_prefer_cpu) {
+			/*
+			 * Since the parent is a superset of the current group,
+			 * if the cpu is not the "asym_prefer_cpu" at the
+			 * current level, it cannot be the preferred CPU at
+			 * higher levels either.
+			 */
+			if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu))
+				return;
+
+			WRITE_ONCE(sg->asym_prefer_cpu, cpu);
+			continue;
+		}
+
+		/* Ranking has improved; CPU is still the preferred one. */
+		if (new_prio >= old_prio)
+			continue;
+
+		for_each_cpu(group_cpu, sched_group_span(sg)) {
+			if (sched_asym_prefer(group_cpu, asym_prefer_cpu))
+				asym_prefer_cpu = group_cpu;
+		}
+
+		WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu);
+	}
+}
+
+#endif /* CONFIG_SMP */
+
 /*
  * Asymmetric CPU capacity bits
  */
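A small usage sketch for the new helper (editorial illustration; the amd_pstate_update_limits() change earlier in this patch is the real caller, and the priority values are made up). sched_asym_prefer() boils down to comparing arch_asym_cpu_priority() of the two CPUs, so the expected pattern is to update the ITMT priority first and then let the helper fix up the cached asym_prefer_cpu:

	/* Illustrative only: react to a firmware re-ranking of one CPU. */
	static void example_reprioritize(int cpu, int new_prio)
	{
		int old_prio = arch_asym_cpu_priority(cpu);

		sched_set_itmt_core_prio(new_prio, cpu);
		sched_update_asym_prefer_cpu(cpu, old_prio, new_prio);
	}

Passing the old and new priorities lets the helper take the cheap paths: if the CPU was not the group's asym_prefer_cpu and its new priority still does not beat the incumbent, nothing changes, and a full rescan of the group is only needed when the currently preferred CPU's priority has dropped.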