Re: [PATCH] x86: Optimize load_mm_cr4 to load_mm_cr4_irqsoff

From: Jan Kiszka
Date: Tue Jul 23 2019 - 14:52:56 EST


On 18.06.19 09:32, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
>
> We only call load_mm_cr4 with interrupts disabled:
> - switch_mm_irqs_off
> - refresh_pce (on_each_cpu callback)
>
> Thus, we can avoid disabling interrupts again in cr4_set/clear_bits.
>
> Instead, provide cr4_set/clear_bits_irqsoffs and call those helpers from
> load_mm_cr4_irqsoff. The renaming in combination with the checks
> in __cr4_set shall ensure that any changes in the boundary conditions of
> the invocations will be detected.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> ---
>
> Found while porting Xenomai with its virtualized interrupt
> infrastructure to a newer kernel.
>
> arch/x86/events/core.c | 2 +-
> arch/x86/include/asm/mmu_context.h | 8 ++++----
> arch/x86/include/asm/tlbflush.h | 30 +++++++++++++++++++++++-------
> arch/x86/mm/tlb.c | 2 +-
> 4 files changed, 29 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index f315425d8468..78a3fba28c62 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2161,7 +2161,7 @@ static int x86_pmu_event_init(struct perf_event *event)
>
> static void refresh_pce(void *ignored)
> {
> - load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
> + load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm));
> }
>
> static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 9024236693d2..16ae821483c8 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -28,16 +28,16 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
>
> DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
>
> -static inline void load_mm_cr4(struct mm_struct *mm)
> +static inline void load_mm_cr4_irqsoff(struct mm_struct *mm)
> {
> if (static_branch_unlikely(&rdpmc_always_available_key) ||
> atomic_read(&mm->context.perf_rdpmc_allowed))
> - cr4_set_bits(X86_CR4_PCE);
> + cr4_set_bits_irqsoff(X86_CR4_PCE);
> else
> - cr4_clear_bits(X86_CR4_PCE);
> + cr4_clear_bits_irqsoff(X86_CR4_PCE);
> }
> #else
> -static inline void load_mm_cr4(struct mm_struct *mm) {}
> +static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {}
> #endif
>
> #ifdef CONFIG_MODIFY_LDT_SYSCALL
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index dee375831962..6f66d841262d 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -290,26 +290,42 @@ static inline void __cr4_set(unsigned long cr4)
> }
>
> /* Set in this cpu's CR4. */
> -static inline void cr4_set_bits(unsigned long mask)
> +static inline void cr4_set_bits_irqsoff(unsigned long mask)
> {
> - unsigned long cr4, flags;
> + unsigned long cr4;
>
> - local_irq_save(flags);
> cr4 = this_cpu_read(cpu_tlbstate.cr4);
> if ((cr4 | mask) != cr4)
> __cr4_set(cr4 | mask);
> - local_irq_restore(flags);
> }
>
> /* Clear in this cpu's CR4. */
> -static inline void cr4_clear_bits(unsigned long mask)
> +static inline void cr4_clear_bits_irqsoff(unsigned long mask)
> {
> - unsigned long cr4, flags;
> + unsigned long cr4;
>
> - local_irq_save(flags);
> cr4 = this_cpu_read(cpu_tlbstate.cr4);
> if ((cr4 & ~mask) != cr4)
> __cr4_set(cr4 & ~mask);
> +}
> +
> +/* Set in this cpu's CR4. */
> +static inline void cr4_set_bits(unsigned long mask)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + cr4_set_bits_irqsoff(mask);
> + local_irq_restore(flags);
> +}
> +
> +/* Clear in this cpu's CR4. */
> +static inline void cr4_clear_bits(unsigned long mask)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + cr4_clear_bits_irqsoff(mask);
> local_irq_restore(flags);
> }
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 91f6db92554c..8fc1eaa55b6e 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -440,7 +440,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
>
> if (next != real_prev) {
> - load_mm_cr4(next);
> + load_mm_cr4_irqsoff(next);
> switch_ldt(real_prev, next);
> }
> }
>

Ping. I think the only remark of Dave was answered.

Jan

--
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux