x86_64 has very clean espfix handling on paravirt: espfix64 is set
up in native_iret, so paravirt systems that override iret bypass
espfix64 automatically. This is robust and straightforward.
x86_32 is messier. espfix is set up before the IRET paravirt patch
point, so it can't be directly conditionalized on whether we use
native_iret. We also can't easily move it into native_iret without
regressing performance due to a bizarre consideration. Specifically,
on 64-bit kernels, the logic is:
if (regs->ss & 0x4)
setup_espfix;
On 32-bit kernels, the logic is:
if ((regs->ss & 0x4) && (regs->cs & 0x3) == 3 &&
(regs->flags & X86_EFLAGS_VM) == 0)
setup_espfix;
The performance of setup_espfix itself is essentially irrelevant, but
the comparison happens on every IRET so its performance matters. On
x86_64, there's no need for any registers except flags to implement
the comparison, so we fold the whole thing into native_iret. On
x86_32, we don't do that because we need a free register to
implement the comparison efficiently. We therefore do espfix setup
before restoring registers on x86_32.
This patch gets rid of the explicit paravirt_enabled check by
introducing X86_BUG_ESPFIX on 32-bit systems and using an ALTERNATIVE
to skip espfix on paravirt systems where iret != native_iret. This is
also messy, but it's at least in line with other things we do.
This improves espfix performance by removing a branch, but no one
cares. More importantly, it removes a paravirt_enabled user, which is
good because paravirt_enabled is ill-defined and is going away.
Signed-off-by: Andy Lutomirski <***@kernel.org>
---
arch/x86/entry/entry_32.S | 15 ++-------------
arch/x86/include/asm/cpufeatures.h | 8 ++++++++
arch/x86/kernel/cpu/common.c | 25 +++++++++++++++++++++++++
3 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6a792603f9f3..4f7615e405a4 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -418,6 +418,8 @@ restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
+ ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
+
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
/*
* Warning: PT_OLDSS(%esp) contains the wrong/random values if we
@@ -444,19 +446,6 @@ ENTRY(iret_exc )
#ifdef CONFIG_X86_ESPFIX32
ldt_ss:
-#ifdef CONFIG_PARAVIRT
- /*
- * The kernel can't run on a non-flat stack if paravirt mode
- * is active. Rather than try to fixup the high bits of
- * ESP, bypass this code entirely. This may break DOSemu
- * and/or Wine support in a paravirt VM, although the option
- * is still available to implement the setting of the high
- * 16-bits in the INTERRUPT_RETURN paravirt-op.
- */
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
-#endif
-
/*
* Setup and switch to ESPFIX stack
*
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6663fae71b12..d11a3aaafd96 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -286,4 +286,12 @@
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#ifdef CONFIG_X86_32
+/*
+ * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
+ * to avoid confusion.
+ */
+#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#endif
+
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 91ddae732a36..c6ef4da8e4f4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -979,6 +979,31 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
+
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_PARAVIRT
+ do {
+ extern void native_iret(void);
+ if (pv_cpu_ops.iret == native_iret)
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+ } while (0);
+#else
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
+#endif
}
/*
--
2.5.0