[RFC PATCH 1/7] x86/entry: Add support for early task context tracking
From: Daniel Bristot de Oliveira
Date: Tue Apr 02 2019 - 16:04:17 EST
Currently, the context of execution is identified through the
preempt_counter, but that counter is only set after the first functions
of the IRQ/NMI handler have already run, so the current context can be
misidentified during that early window. For instance, ftrace/perf might
drop events in the early stage of IRQ/NMI handlers because the
preempt_counter was not yet set.
The proposed approach is to use a dedicated per-cpu variable to keep
track of the context of execution, with values set before the execution
of the first C function of the interrupt handler.
This is a PoC for x86_64.
Signed-off-by: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: "Joel Fernandes (Google)" <joel@xxxxxxxxxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: Tommaso Cucinotta <tommaso.cucinotta@xxxxxxxxxxxxxxx>
Cc: Romulo Silva de Oliveira <romulo.deoliveira@xxxxxxx>
Cc: Clark Williams <williams@xxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Cc: x86@xxxxxxxxxx
---
arch/x86/entry/entry_64.S | 9 +++++++++
arch/x86/include/asm/irqflags.h | 30 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/common.c | 4 ++++
include/linux/irqflags.h | 4 ++++
kernel/softirq.c | 5 ++++-
5 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1f0efdb7b629..1471b544241f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -545,6 +545,7 @@ ENTRY(interrupt_entry)
testb $3, CS+8(%rsp)
jz 1f
+ TASK_CONTEXT_SET_BIT context=TASK_CTX_IRQ
/*
* IRQ from user mode.
*
@@ -561,6 +562,8 @@ ENTRY(interrupt_entry)
1:
ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
+
+ TASK_CONTEXT_SET_BIT context=TASK_CTX_IRQ
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -586,6 +589,7 @@ ret_from_intr:
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
+ TASK_CONTEXT_RESET_BIT context=TASK_CTX_IRQ
LEAVE_IRQ_STACK
testb $3, CS(%rsp)
@@ -780,6 +784,7 @@ ENTRY(\sym)
call interrupt_entry
UNWIND_HINT_REGS indirect=1
call \do_sym /* rdi points to pt_regs */
+ TASK_CONTEXT_RESET_BIT context=TASK_CTX_IRQ
jmp ret_from_intr
END(\sym)
_ASM_NOKPROBE(\sym)
@@ -1403,9 +1408,11 @@ ENTRY(nmi)
* done with the NMI stack.
*/
+ TASK_CONTEXT_SET_BIT context=TASK_CTX_NMI
movq %rsp, %rdi
movq $-1, %rsi
call do_nmi
+ TASK_CONTEXT_RESET_BIT context=TASK_CTX_NMI
/*
* Return back to user mode. We must *not* do the normal exit
@@ -1615,10 +1622,12 @@ end_repeat_nmi:
call paranoid_entry
UNWIND_HINT_REGS
+ TASK_CONTEXT_SET_BIT context=TASK_CTX_NMI
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
call do_nmi
+ TASK_CONTEXT_RESET_BIT context=TASK_CTX_NMI
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 058e40fed167..5a12bc3ea02b 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -3,6 +3,7 @@
#define _X86_IRQFLAGS_H_
#include <asm/processor-flags.h>
+#include <asm/percpu.h>
#ifndef __ASSEMBLY__
@@ -202,4 +203,33 @@ static inline int arch_irqs_disabled(void)
#endif
#endif /* __ASSEMBLY__ */
+#ifdef CONFIG_X86_64
+/*
+ * NOTE: I know I need to implement this for 32-bit as well.
+ * But... this is just a POC.
+ */
+#define ARCH_HAS_TASK_CONTEXT 1
+
+#define TASK_CTX_THREAD 0x0
+#define TASK_CTX_SOFTIRQ 0x1
+#define TASK_CTX_IRQ 0x2
+#define TASK_CTX_NMI 0x4
+
+#ifdef __ASSEMBLY__
+.macro TASK_CONTEXT_SET_BIT context:req
+ orb $\context, PER_CPU_VAR(task_context)
+.endm
+
+.macro TASK_CONTEXT_RESET_BIT context:req
+ andb $~\context, PER_CPU_VAR(task_context)
+.endm
+#else /* __ASSEMBLY__ */
+DECLARE_PER_CPU(unsigned char, task_context);
+
+static __always_inline void task_context_set(unsigned char context)
+{
+ raw_cpu_write_1(task_context, context);
+}
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_X86_64 */
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb28e98a0659..1acbec22319b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1531,6 +1531,8 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned char, task_context) __visible = 0;
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
@@ -1604,6 +1606,8 @@ EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned char, task_context) __visible = 0;
+
/*
* On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
* the top of the kernel stack. Use an extra percpu variable to track the
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 21619c92c377..1c3473bbe5d2 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -168,4 +168,8 @@ do { \
#define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
+#ifndef ARCH_HAS_TASK_CONTEXT
+#define task_context_set(context) do {} while (0)
+#endif
+
#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 10277429ed84..324de769dc07 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -410,8 +410,11 @@ void irq_exit(void)
#endif
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET);
- if (!in_interrupt() && local_softirq_pending())
+ if (!in_interrupt() && local_softirq_pending()) {
+ task_context_set(TASK_CTX_SOFTIRQ);
invoke_softirq();
+ task_context_set(TASK_CTX_IRQ);
+ }
tick_irq_exit();
rcu_irq_exit();
--
2.20.1