summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c3
-rw-r--r--arch/x86/events/intel/core.c9
-rw-r--r--arch/x86/events/intel/ds.c5
-rw-r--r--arch/x86/events/intel/uncore_discovery.h2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c28
-rw-r--r--arch/x86/hyperv/hv_init.c8
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h8
-rw-r--r--arch/x86/include/asm/insn-eval.h1
-rw-r--r--arch/x86/include/asm/irq_stack.h37
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/mem_encrypt.h1
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/stacktrace.h10
-rw-r--r--arch/x86/include/asm/traps.h6
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/cc_platform.c69
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/common.c44
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/hygon.c2
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c5
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c12
-rw-r--r--arch/x86/kernel/dumpstack_64.c6
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--arch/x86/kernel/setup.c66
-rw-r--r--arch/x86/kernel/sev.c32
-rw-r--r--arch/x86/kernel/traps.c60
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kvm/cpuid.c47
-rw-r--r--arch/x86/kvm/hyperv.c4
-rw-r--r--arch/x86/kvm/ioapic.c2
-rw-r--r--arch/x86/kvm/ioapic.h4
-rw-r--r--arch/x86/kvm/mmu/mmu.c7
-rw-r--r--arch/x86/kvm/mmu/spte.h7
-rw-r--r--arch/x86/kvm/svm/sev.c7
-rw-r--r--arch/x86/kvm/vmx/nested.c125
-rw-r--r--arch/x86/kvm/vmx/vmx.c68
-rw-r--r--arch/x86/kvm/vmx/vmx.h63
-rw-r--r--arch/x86/kvm/x86.c118
-rw-r--r--arch/x86/kvm/x86.h12
-rw-r--r--arch/x86/kvm/xen.c4
-rw-r--r--arch/x86/lib/insn-eval.c2
-rw-r--r--arch/x86/lib/insn.c5
-rw-r--r--arch/x86/mm/cpu_entry_area.c7
-rw-r--r--arch/x86/mm/fault.c20
-rw-r--r--arch/x86/mm/mem_encrypt.c1
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c9
51 files changed, 630 insertions, 326 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d9830e7e1060..6ce906815bb2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1256,7 +1256,8 @@ config TOSHIBA
config I8K
tristate "Dell i8k legacy laptop support"
- select HWMON
+ depends on HWMON
+ depends on PROC_FS
select SENSORS_DELL_SMM
help
This option enables legacy /proc/i8k userspace interface in hwmon
@@ -1518,6 +1519,7 @@ config AMD_MEM_ENCRYPT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ select ARCH_HAS_CC_PLATFORM
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 0fc961bef299..e09f4672dd38 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -866,7 +866,7 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
- if (err)
+ if (!walk.nbytes)
return err;
} else {
tail = 0;
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1b40b9297083..fd2ee9408e91 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -226,7 +226,8 @@ bool emulate_vsyscall(unsigned long error_code,
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
- do_exit(SIGSYS);
+ force_exit_sig(SIGSYS);
+ return true;
}
regs->orig_ax = -1;
if (tmp)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9a044438072b..c7f1cc433a6a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -243,7 +243,8 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
static struct event_constraint intel_icl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
@@ -288,7 +289,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
static struct event_constraint intel_spr_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
@@ -2998,8 +2999,10 @@ intel_vlbr_constraints(struct perf_event *event)
{
struct event_constraint *c = &vlbr_constraint;
- if (unlikely(constraint_match(c, event->hw.config)))
+ if (unlikely(constraint_match(c, event->hw.config))) {
+ event->hw.flags |= c->flags;
return c;
+ }
return NULL;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8647713276a7..4dbb55a43dad 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -923,7 +923,8 @@ struct event_constraint intel_skl_pebs_event_constraints[] = {
};
struct event_constraint intel_icl_pebs_event_constraints[] = {
- INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */
INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -943,7 +944,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
};
struct event_constraint intel_spr_pebs_event_constraints[] = {
- INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 7280c8a3c831..6d735611c281 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -30,7 +30,7 @@
#define uncore_discovery_invalid_unit(unit) \
- (!unit.table1 || !unit.ctl || !unit.table3 || \
+ (!unit.table1 || !unit.ctl || \
unit.table1 == -1ULL || unit.ctl == -1ULL || \
unit.table3 == -1ULL)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 5ddc0f30db6f..3660f698fb2a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -452,7 +452,7 @@
#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0
/* ICX IMC */
-#define ICX_NUMBER_IMC_CHN 2
+#define ICX_NUMBER_IMC_CHN 3
#define ICX_IMC_MEM_STRIDE 0x4
/* SPR */
@@ -3608,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
struct extra_reg *er;
int idx = 0;
+ /* Any of the CHA events may be filtered by Thread/Core-ID.*/
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
if (er->event != (event->hw.config & er->config_mask))
@@ -3675,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
@@ -4525,6 +4529,13 @@ static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
}
+static struct event_constraint snr_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
static struct intel_uncore_type snr_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -4536,6 +4547,7 @@ static struct intel_uncore_type snr_uncore_iio = {
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
.box_ctl = SNR_IIO_MSR_PMON_BOX_CTL,
.msr_offset = SNR_IIO_MSR_OFFSET,
+ .constraints = snr_uncore_iio_constraints,
.ops = &ivbep_uncore_msr_ops,
.format_group = &snr_uncore_iio_format_group,
.attr_update = snr_iio_attr_update,
@@ -5076,8 +5088,10 @@ static struct event_constraint icx_uncore_iio_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
UNCORE_EVENT_CONSTRAINT(0x03, 0x3),
UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
EVENT_CONSTRAINT_END
};
@@ -5463,7 +5477,7 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = {
static struct intel_uncore_type icx_uncore_imc = {
.name = "imc",
.num_counters = 4,
- .num_boxes = 8,
+ .num_boxes = 12,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
@@ -5647,6 +5661,7 @@ static struct intel_uncore_type spr_uncore_chabox = {
.event_mask = SPR_CHA_PMON_EVENT_MASK,
.event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
.num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
.ops = &spr_uncore_chabox_ops,
.format_group = &spr_uncore_chabox_format_group,
.attr_update = uncore_alias_groups,
@@ -5658,6 +5673,7 @@ static struct intel_uncore_type spr_uncore_iio = {
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT,
.format_group = &snr_uncore_iio_format_group,
.attr_update = uncore_alias_groups,
+ .constraints = icx_uncore_iio_constraints,
};
static struct attribute *spr_uncore_raw_formats_attr[] = {
@@ -5686,9 +5702,16 @@ static struct intel_uncore_type spr_uncore_irp = {
};
+static struct event_constraint spr_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
static struct intel_uncore_type spr_uncore_m2pcie = {
SPR_UNCORE_COMMON_FORMAT(),
.name = "m2pcie",
+ .constraints = spr_uncore_m2pcie_constraints,
};
static struct intel_uncore_type spr_uncore_pcu = {
@@ -5765,6 +5788,7 @@ static struct intel_uncore_type spr_uncore_upi = {
static struct intel_uncore_type spr_uncore_m3upi = {
SPR_UNCORE_PCI_COMMON_FORMAT(),
.name = "m3upi",
+ .constraints = icx_uncore_m3upi_constraints,
};
static struct intel_uncore_type spr_uncore_mdf = {
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 708a2712a516..d20eef3d452a 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -139,7 +139,6 @@ void set_hv_tscchange_cb(void (*cb)(void))
struct hv_reenlightenment_control re_ctrl = {
.vector = HYPERV_REENLIGHTENMENT_VECTOR,
.enabled = 1,
- .target_vp = hv_vp_index[smp_processor_id()]
};
struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
@@ -148,13 +147,20 @@ void set_hv_tscchange_cb(void (*cb)(void))
return;
}
+ if (!hv_vp_index)
+ return;
+
hv_reenlightenment_cb = cb;
/* Make sure callback is registered before we write to MSRs */
wmb();
+ re_ctrl.target_vp = hv_vp_index[get_cpu()];
+
wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+
+ put_cpu();
}
EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 3d52b094850a..dd5ea1bdf04c 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -10,6 +10,12 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#define VC_EXCEPTION_STKSZ EXCEPTION_STKSZ
+#else
+#define VC_EXCEPTION_STKSZ 0
+#endif
+
/* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
char DF_stack_guard[guardsize]; \
@@ -28,7 +34,7 @@
/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
- ESTACKS_MEMBERS(0, 0)
+ ESTACKS_MEMBERS(0, VC_EXCEPTION_STKSZ)
};
/* The effective cpu entry area mapping with guard pages. */
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 91d7182ad2d6..4ec3613551e3 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -21,6 +21,7 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
int insn_fetch_from_user_inatomic(struct pt_regs *regs,
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 562854c60808..8d55bd11848c 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -77,11 +77,11 @@
* Function calls can clobber anything except the callee-saved
* registers. Tell the compiler.
*/
-#define call_on_irqstack(func, asm_call, argconstr...) \
+#define call_on_stack(stack, func, asm_call, argconstr...) \
{ \
register void *tos asm("r11"); \
\
- tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \
+ tos = ((void *)(stack)); \
\
asm_inline volatile( \
"movq %%rsp, (%[tos]) \n" \
@@ -98,6 +98,25 @@
); \
}
+#define ASM_CALL_ARG0 \
+ "call %P[__func] \n"
+
+#define ASM_CALL_ARG1 \
+ "movq %[arg1], %%rdi \n" \
+ ASM_CALL_ARG0
+
+#define ASM_CALL_ARG2 \
+ "movq %[arg2], %%rsi \n" \
+ ASM_CALL_ARG1
+
+#define ASM_CALL_ARG3 \
+ "movq %[arg3], %%rdx \n" \
+ ASM_CALL_ARG2
+
+#define call_on_irqstack(func, asm_call, argconstr...) \
+ call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
+ func, asm_call, argconstr)
+
/* Macros to assert type correctness for run_*_on_irqstack macros */
#define assert_function_type(func, proto) \
static_assert(__builtin_types_compatible_p(typeof(&func), proto))
@@ -147,8 +166,7 @@
*/
#define ASM_CALL_SYSVEC \
"call irq_enter_rcu \n" \
- "movq %[arg1], %%rdi \n" \
- "call %P[__func] \n" \
+ ASM_CALL_ARG1 \
"call irq_exit_rcu \n"
#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs)
@@ -168,12 +186,10 @@
*/
#define ASM_CALL_IRQ \
"call irq_enter_rcu \n" \
- "movq %[arg1], %%rdi \n" \
- "movl %[arg2], %%esi \n" \
- "call %P[__func] \n" \
+ ASM_CALL_ARG2 \
"call irq_exit_rcu \n"
-#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector)
+#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" ((unsigned long)vector)
#define run_irq_on_irqstack_cond(func, regs, vector) \
{ \
@@ -185,9 +201,6 @@
IRQ_CONSTRAINTS, regs, vector); \
}
-#define ASM_CALL_SOFTIRQ \
- "call %P[__func] \n"
-
/*
* Macro to invoke __do_softirq on the irq stack. This is only called from
* task context when bottom halves are about to be reenabled and soft
@@ -197,7 +210,7 @@
#define do_softirq_own_stack() \
{ \
__this_cpu_write(hardirq_stack_inuse, true); \
- call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \
+ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
__this_cpu_write(hardirq_stack_inuse, false); \
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 13f64654dfff..fa24fd4d138d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -364,6 +364,7 @@ union kvm_mmu_extended_role {
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
unsigned int cr4_la57:1;
+ unsigned int efer_lma:1;
};
};
@@ -751,7 +752,7 @@ struct kvm_vcpu_arch {
u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_pfn_cache cache;
+ struct gfn_to_hva_cache cache;
} st;
u64 l1_tsc_offset;
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 9c80c68d75b5..3fb9f5ebefa4 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -13,6 +13,7 @@
#ifndef __ASSEMBLY__
#include <linux/init.h>
+#include <linux/cc_platform.h>
#include <asm/bootparam.h>
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index a8d4ad856568..e9e2c3ba5923 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -15,7 +15,7 @@
#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
-#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9ad2acaaae9b..577f342dbfb2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -518,6 +518,7 @@ struct thread_struct {
*/
unsigned long iopl_emul;
+ unsigned int iopl_warn:1;
unsigned int sig_on_uaccess_err:1;
/*
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f248eb2ac2d4..3881b5333eb8 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -38,6 +38,16 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
+static __always_inline
+bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
+{
+ /* make sure it's not in the stack proper */
+ if (get_stack_info_noinstr(stack, current, info))
+ return false;
+ /* but if it is in the page below it, we hit a guard */
+ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info);
+}
+
const char *stack_type_name(enum stack_type type);
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7f7200021bd1..6221be7cafc3 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -40,9 +40,9 @@ void math_emulate(struct math_emu_info *);
bool fault_in_kernel_space(unsigned long address);
#ifdef CONFIG_VMAP_STACK
-void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address);
+void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info);
#endif
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f4e8fa6ed75..2ff3e600f426 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_sev.o = -pg
+CFLAGS_REMOVE_cc_platform.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
KASAN_SANITIZE_sev.o := n
+KASAN_SANITIZE_cc_platform.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
@@ -47,6 +49,7 @@ endif
KCOV_INSTRUMENT := n
CFLAGS_head$(BITS).o += -fno-stack-protector
+CFLAGS_cc_platform.o += -fno-stack-protector
CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
@@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
+
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
new file mode 100644
index 000000000000..03bb2f343ddb
--- /dev/null
+++ b/arch/x86/kernel/cc_platform.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/export.h>
+#include <linux/cc_platform.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/processor.h>
+
+static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_INTEL_TDX_GUEST
+ return false;
+#else
+ return false;
+#endif
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * cc_platform_has() function is used for this. When a distinction isn't
+ * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+static bool amd_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return sme_me_mask;
+
+ case CC_ATTR_HOST_MEM_ENCRYPT:
+ return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED);
+
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+
+ case CC_ATTR_GUEST_STATE_ENCRYPT:
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+
+ default:
+ return false;
+ }
+#else
+ return false;
+#endif
+}
+
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ if (sme_me_mask)
+ return amd_cc_platform_has(attr);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b3410f1ac217..58b1416c05da 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1396,9 +1396,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1419,10 +1418,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1458,8 +1490,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index acfd5d9f93c6..bb9a46a804bf 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -547,12 +547,13 @@ bool intel_filter_mce(struct mce *m)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */
+ /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
if ((c->x86 == 6) &&
((c->x86_model == INTEL_FAM6_HASWELL) ||
(c->x86_model == INTEL_FAM6_HASWELL_L) ||
(c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G)) &&
+ (c->x86_model == INTEL_FAM6_HASWELL_G) ||
+ (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 63d3de02bbcc..8471a8b9b48e 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
-/* The free page list lock protected variables prepend the lock. */
-static unsigned long sgx_nr_free_pages;
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;
@@ -403,14 +402,15 @@ skip:
spin_lock(&node->lock);
list_add_tail(&epc_page->list, &node->free_page_list);
- sgx_nr_free_pages++;
spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+ !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
@@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- sgx_nr_free_pages--;
spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
return page;
}
@@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
spin_lock(&node->lock);
list_add_tail(&page->list, &node->free_page_list);
- sgx_nr_free_pages++;
spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5601b95944fa..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
+ if (type == STACK_TYPE_TASK)
+ return "TASK";
+
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e28f6a5d14f1..766ffe3ba313 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -291,8 +291,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
if (handler)
kvm_posted_intr_wakeup_handler = handler;
- else
+ else {
kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
}
EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d9463e3096b..f2f733bcb2b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -132,6 +132,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 40ed44ead063..d71267081153 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -742,6 +742,28 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
+static char *prepare_command_line(void)
+{
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+
+ parse_early_param();
+
+ return command_line;
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -831,6 +853,23 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
/*
+ * x86_configure_nx() is called before parse_early_param() (called by
+ * prepare_command_line()) to detect whether hardware doesn't support
+ * NX (so that the early EHCI debug console setup can safely call
+ * set_fixmap()). It may then be called again from within noexec_setup()
+ * during parsing early parameters to honor the respective command line
+ * option.
+ */
+ x86_configure_nx();
+
+ /*
+ * This parses early params and it needs to run before
+ * early_reserve_memory() because latter relies on such settings
+ * supplied as early params.
+ */
+ *cmdline_p = prepare_command_line();
+
+ /*
* Do some memory reservations *before* memory is added to memblock, so
* memblock allocations won't overwrite it.
*
@@ -863,33 +902,6 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
- /*
- * x86_configure_nx() is called before parse_early_param() to detect
- * whether hardware doesn't support NX (so that the early EHCI debug
- * console setup can safely call set_fixmap()). It may then be called
- * again from within noexec_setup() during parsing early parameters
- * to honor the respective command line option.
- */
- x86_configure_nx();
-
- parse_early_param();
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a6895e440bc3..88401675dabb 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -46,16 +46,6 @@ static struct ghcb __initdata *boot_ghcb;
struct sev_es_runtime_data {
struct ghcb ghcb_page;
- /* Physical storage for the per-CPU IST stack of the #VC handler */
- char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
- /*
- * Physical storage for the per-CPU fall-back stack of the #VC handler.
- * The fall-back stack is used when it is not safe to switch back to the
- * interrupted stack in the #VC entry code.
- */
- char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
-
/*
* Reserve one page per CPU as backup storage for the unencrypted GHCB.
* It is needed when an NMI happens while the #VC handler uses the real
@@ -99,27 +89,6 @@ DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);
-static void __init setup_vc_stacks(int cpu)
-{
- struct sev_es_runtime_data *data;
- struct cpu_entry_area *cea;
- unsigned long vaddr;
- phys_addr_t pa;
-
- data = per_cpu(runtime_data, cpu);
- cea = get_cpu_entry_area(cpu);
-
- /* Map #VC IST stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
- pa = __pa(data->ist_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-
- /* Map VC fall-back stack */
- vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
- pa = __pa(data->fallback_stack);
- cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
-}
-
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -787,7 +756,6 @@ void __init sev_es_init_vc_handling(void)
for_each_possible_cpu(cpu) {
alloc_runtime_data(cpu);
init_ghcb(cpu);
- setup_vc_stacks(cpu);
}
sev_es_setup_play_dead();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..5b1984d46822 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -313,17 +313,19 @@ out:
}
#ifdef CONFIG_VMAP_STACK
-__visible void __noreturn handle_stack_overflow(const char *message,
- struct pt_regs *regs,
- unsigned long fault_address)
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info)
{
- printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
- (void *)fault_address, current->stack,
- (char *)current->stack + THREAD_SIZE - 1);
- die(message, regs, 0);
+ const char *name = stack_type_name(info->type);
+
+ printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+ name, (void *)fault_address, info->begin, info->end);
+
+ die("stack guard page", regs, 0);
/* Be absolutely certain we don't return. */
- panic("%s", message);
+ panic("%s stack guard hit", name);
}
#endif
@@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
#ifdef CONFIG_VMAP_STACK
unsigned long address = read_cr2();
+ struct stack_info info;
#endif
#ifdef CONFIG_X86_ESPFIX64
@@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
* stack even if the actual trigger for the double fault was
* something else.
*/
- if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
- handle_stack_overflow("kernel stack overflow (double-fault)",
- regs, address);
- }
+ if (get_stack_guard_info((void *)address, &info))
+ handle_stack_overflow(regs, address, &info);
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
@@ -528,6 +529,36 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
+static bool fixup_iopl_exception(struct pt_regs *regs)
+{
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
+
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
+
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
+
+ if (get_user(byte, (const char __user *)ip))
+ return false;
+
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
+ }
+
+ regs->ip += 1;
+ return true;
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -553,6 +584,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk = current;
if (user_mode(regs)) {
+ if (fixup_iopl_exception(regs))
+ goto exit;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -709,7 +743,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
stack = (unsigned long *)sp;
if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
- info.type >= STACK_TYPE_EXCEPTION_LAST)
+ info.type > STACK_TYPE_EXCEPTION_LAST)
sp = __this_cpu_ist_top_va(VC2);
sync:
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e5a7a10a0164..17d58740891e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -142,6 +142,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
user_access_end();
+exit_vm86:
preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
@@ -161,7 +162,8 @@ Efault_end:
user_access_end();
Efault:
pr_alert("could not access userspace vm86 info\n");
- do_exit(SIGSEGV);
+ force_exit_sig(SIGSEGV);
+ goto exit_vm86;
}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 751aa85a3001..f666fd79d8ad 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -232,6 +232,25 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
}
+static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ int r;
+
+ r = kvm_check_cpuid(e2, nent);
+ if (r)
+ return r;
+
+ kvfree(vcpu->arch.cpuid_entries);
+ vcpu->arch.cpuid_entries = e2;
+ vcpu->arch.cpuid_nent = nent;
+
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
+
+ return 0;
+}
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -268,18 +287,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
e2[i].padding[2] = 0;
}
- r = kvm_check_cpuid(e2, cpuid->nent);
- if (r) {
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
kvfree(e2);
- goto out_free_cpuid;
- }
-
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = cpuid->nent;
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
out_free_cpuid:
kvfree(e);
@@ -303,20 +313,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
return PTR_ERR(e2);
}
- r = kvm_check_cpuid(e2, cpuid->nent);
- if (r) {
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
kvfree(e2);
- return r;
- }
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = cpuid->nent;
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
-
- return 0;
+ return r;
}
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index d5124b520f76..b9a13dc211d5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2022,7 +2022,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
bool longmode;
- longmode = is_64_bit_mode(vcpu);
+ longmode = is_64_bit_hypercall(vcpu);
if (longmode)
kvm_rax_write(vcpu, result);
else {
@@ -2171,7 +2171,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_X86_64
- if (is_64_bit_mode(vcpu)) {
+ if (is_64_bit_hypercall(vcpu)) {
hc.param = kvm_rcx_read(vcpu);
hc.ingpa = kvm_rdx_read(vcpu);
hc.outgpa = kvm_r8_read(vcpu);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 8c065da73f8e..4e0f52660842 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index bbd4a5d18b5d..27e61ff3ac3e 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -39,13 +39,13 @@ struct kvm_vcpu;
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
- DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
- u8 vectors[KVM_MAX_VCPU_ID + 1];
+ u8 vectors[KVM_MAX_VCPU_ID];
};
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0cc58901bf7a..0e0f960a0bf8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4679,6 +4679,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
/* PKEY and LA57 are active iff long mode is active. */
ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ ext.efer_lma = ____is_efer_lma(regs);
}
ext.valid = 1;
@@ -5758,13 +5759,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush);
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end - gfn_start);
kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index eb7b227fc6cf..31d6456d8ac3 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -310,12 +310,7 @@ static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
u64 spte, int level)
{
- /*
- * Use a bitwise-OR instead of a logical-OR to aggregate the reserved
- * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc
- * (this is extremely unlikely to be short-circuited as true).
- */
- return __is_bad_mt_xwr(rsvd_check, spte) |
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
__is_rsvd_bits_set(rsvd_check, spte, level);
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 7e34d7163ada..ff19ce0780fe 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1787,7 +1787,12 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
mutex_unlock(&source_kvm->lock);
mutex_lock(&kvm->lock);
- if (sev_guest(kvm)) {
+ /*
+ * Disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || kvm->created_vcpus) {
ret = -EINVAL;
goto e_mirror_unlock;
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index eedcebf58004..8936d8383b2e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -524,29 +524,6 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
}
/*
- * Check if MSR is intercepted for L01 MSR bitmap.
- */
-static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
-{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return true;
-
- msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
-
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
-
- return true;
-}
-
-/*
* If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
*/
@@ -599,6 +576,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
}
}
+#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \
+static inline \
+void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \
+ unsigned long *msr_bitmap_l1, \
+ unsigned long *msr_bitmap_l0, u32 msr) \
+{ \
+ if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \
+ vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \
+ vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+ else \
+ vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \
+}
+BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
+BUILD_NVMX_MSR_INTERCEPT_HELPER(write)
+
+static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
+ unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_l0,
+ u32 msr, int types)
+{
+ if (types & MSR_TYPE_R)
+ nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+ if (types & MSR_TYPE_W)
+ nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
+ msr_bitmap_l0, msr);
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -606,10 +611,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
int msr;
unsigned long *msr_bitmap_l1;
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
- struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+ unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
@@ -660,44 +666,27 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
}
}
- /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
+ /*
+ * Always check vmcs01's bitmap to honor userspace MSR filters and any
+ * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
+ */
#ifdef CONFIG_X86_64
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_FS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_FS_BASE, MSR_TYPE_RW);
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_GS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_GS_BASE, MSR_TYPE_RW);
- nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
- MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
- /*
- * Checking the L0->L1 bitmap is trying to verify two things:
- *
- * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
- * ensures that we do not accidentally generate an L02 MSR bitmap
- * from the L12 MSR bitmap that is too permissive.
- * 2. That L1 or L2s have actually used the MSR. This avoids
- * unnecessarily merging of the bitmap if the MSR is unused. This
- * works properly because we only update the L01 MSR bitmap lazily.
- * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
- * updated to reflect this when L1 (or its L2s) actually write to
- * the MSR.
- */
- if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- MSR_IA32_SPEC_CTRL,
- MSR_TYPE_R | MSR_TYPE_W);
-
- if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD, MSR_TYPE_W);
- kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
return true;
}
@@ -2865,6 +2854,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+#ifdef CONFIG_X86_64
+ if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
+ !!(vcpu->arch.efer & EFER_LMA)))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -2889,18 +2889,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
return -EINVAL;
#ifdef CONFIG_X86_64
- ia32e = !!(vcpu->arch.efer & EFER_LMA);
+ ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
#else
ia32e = false;
#endif
if (ia32e) {
- if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
- CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
return -EINVAL;
} else {
- if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
- CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
CC((vmcs12->host_rip) >> 32))
return -EINVAL;
@@ -3570,6 +3568,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (nested_vmx_check_controls(vcpu, vmcs12))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ if (nested_vmx_check_address_space_size(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+
if (nested_vmx_check_host_state(vcpu, vmcs12))
return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7d595effb66f..671ff4150fcb 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -769,24 +769,13 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
return true;
- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
-
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
-
- return true;
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
+ MSR_IA32_SPEC_CTRL);
}
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -3695,46 +3684,6 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
-}
-
-static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
-}
-
void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6720,7 +6669,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
@@ -7551,6 +7500,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7879,8 +7830,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7904,6 +7853,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 592217fd7d92..3f9c8548625d 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -400,6 +400,69 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+static inline bool vmx_test_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ return test_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ return test_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+ return true;
+}
+
+static inline bool vmx_test_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ return test_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ return test_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+ return true;
+}
+
+static inline void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static inline void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static inline void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static inline void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+
static inline u8 vmx_get_rvi(void)
{
return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bfe0de3008a6..16c7465619f7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3195,8 +3195,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ u64 steal;
+ u32 version;
if (kvm_xen_msr_enabled(vcpu->kvm)) {
kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3209,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- /* -EAGAIN is returned in atomic context so we can just return. */
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
- &map, &vcpu->arch.st.cache, false))
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+ st = (struct kvm_steal_time __user *)ghc->hva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
- u8 st_preempted = xchg(&st->preempted, 0);
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
+
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
st_preempted & KVM_VCPU_FLUSH_TLB);
if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
} else {
- st->preempted = 0;
- }
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
- vcpu->arch.st.preempted = 0;
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
- if (st->version & 1)
- st->version += 1; /* first time write, random junk */
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
smp_wmb();
- st->steal += current->sched_info.run_delay -
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
- smp_wmb();
-
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4285,8 +4327,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4294,16 +4338,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
- &vcpu->arch.st.cache, true))
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
+ return;
+
+ slots = kvm_memslots(vcpu->kvm);
+
+ if (unlikely(slots->generation != ghc->generation ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -8686,7 +8737,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- op_64_bit = is_64_bit_mode(vcpu);
+ op_64_bit = is_64_bit_hypercall(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -9378,12 +9429,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- if (to_hv_vcpu(vcpu))
+ if (to_hv_vcpu(vcpu)) {
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
+ static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ return;
+ }
- static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ static_call(kvm_x86_load_eoi_exitmap)(
+ vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -10817,11 +10872,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
int idx;
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
kvmclock_reset(vcpu);
static_call(kvm_x86_vcpu_free)(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7d66d63dc55a..9776a1db8ea8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -153,12 +153,24 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
{
int cs_db, cs_l;
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected);
+
if (!is_long_mode(vcpu))
return false;
static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
+static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If running with protected guest state, the CS register is not
+ * accessible. The hypercall register values will have had to been
+ * provided in 64-bit mode, so assume the guest is in 64-bit.
+ */
+ return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
+}
+
static inline bool x86_exception_has_error_code(unsigned int vector)
{
static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8f62baebd028..0787d6645573 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -299,7 +299,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn);
+ data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn;
r = 0;
break;
@@ -698,7 +698,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- longmode = is_64_bit_mode(vcpu);
+ longmode = is_64_bit_hypercall(vcpu);
if (!longmode) {
params[0] = (u32)kvm_rbx_read(vcpu);
params[1] = (u32)kvm_rcx_read(vcpu);
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index a1d24fdc07cf..eb3ccffb9b9d 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1417,7 +1417,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
-static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
{
unsigned long seg_base = 0;
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index c565def611e2..55e371cc69fd 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -13,6 +13,7 @@
#endif
#include <asm/inat.h> /*__ignore_sync_check__ */
#include <asm/insn.h> /* __ignore_sync_check__ */
+#include <asm/unaligned.h> /* __ignore_sync_check__ */
#include <linux/errno.h>
#include <linux/kconfig.h>
@@ -37,10 +38,10 @@
((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
- ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); })
+ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); })
#define __peek_nbyte_next(t, insn, n) \
- ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); })
+ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index f5e1e60c9095..6c2f1b76a0b6 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -110,6 +110,13 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
cea_map_stack(NMI);
cea_map_stack(DB);
cea_map_stack(MCE);
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
+ cea_map_stack(VC);
+ cea_map_stack(VC2);
+ }
+ }
}
#else
static inline void percpu_setup_exception_stacks(unsigned int cpu)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 84a2c8c4af73..4bfed53e210e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -32,6 +32,7 @@
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
+#include <asm/irq_stack.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -631,6 +632,9 @@ static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
+#ifdef CONFIG_VMAP_STACK
+ struct stack_info info;
+#endif
unsigned long flags;
int sig;
@@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
- unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
+ get_stack_guard_info((void *)address, &info)) {
/*
* We're likely to be running with very little stack space
* left. It's plausible that we'd hit this condition but
@@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
* and then double-fault, though, because we're likely to
* break the console driver and lose most of the stack dump.
*/
- asm volatile ("movq %[stack], %%rsp\n\t"
- "call handle_stack_overflow\n\t"
- "1: jmp 1b"
- : ASM_CALL_CONSTRAINT
- : "D" ("kernel stack overflow (page fault)"),
- "S" (regs), "d" (address),
- [stack] "rm" (stack));
+ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
+ handle_stack_overflow,
+ ASM_CALL_ARG3,
+ , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
+
unreachable();
}
#endif
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ff08dc463634..e29b1418d00c 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -20,6 +20,7 @@
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
+#include <linux/cc_platform.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 470b20208430..700ce8fdea87 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -27,6 +27,15 @@
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mem_encrypt.h>