From 0d55303c51a4f35f674617e415632d492b596c26 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:25 -0700 Subject: compat: Move compat_timespec/ timeval to compat_time.h All the current architecture specific defines for these are the same. Refactor these common defines to a common header file. The new common linux/compat_time.h is also useful as it will eventually be used to hold all the defines that are needed for compat time types that support non y2038 safe types. New architectures need not have to define these new types as they will only use new y2038 safe syscalls. This file can be deleted after y2038 when we stop supporting non y2038 safe syscalls. The patch also requires an operation similar to: git grep "asm/compat\.h" | cut -d ":" -f 1 | xargs -n 1 sed -i -e "s%asm/compat.h%linux/compat.h%g" Cc: acme@kernel.org Cc: benh@kernel.crashing.org Cc: borntraeger@de.ibm.com Cc: catalin.marinas@arm.com Cc: cmetcalf@mellanox.com Cc: cohuck@redhat.com Cc: davem@davemloft.net Cc: deller@gmx.de Cc: devel@driverdev.osuosl.org Cc: gerald.schaefer@de.ibm.com Cc: gregkh@linuxfoundation.org Cc: heiko.carstens@de.ibm.com Cc: hoeppner@linux.vnet.ibm.com Cc: hpa@zytor.com Cc: jejb@parisc-linux.org Cc: jwi@linux.vnet.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: mark.rutland@arm.com Cc: mingo@redhat.com Cc: mpe@ellerman.id.au Cc: oberpar@linux.vnet.ibm.com Cc: oprofile-list@lists.sf.net Cc: paulus@samba.org Cc: peterz@infradead.org Cc: ralf@linux-mips.org Cc: rostedt@goodmis.org Cc: rric@kernel.org Cc: schwidefsky@de.ibm.com Cc: sebott@linux.vnet.ibm.com Cc: sparclinux@vger.kernel.org Cc: sth@linux.vnet.ibm.com Cc: ubraun@linux.vnet.ibm.com Cc: will.deacon@arm.com Cc: x86@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Acked-by: Steven Rostedt (VMware) Acked-by: Catalin Marinas Acked-by: James Hogan Acked-by: Helge Deller Signed-off-by: Arnd Bergmann --- arch/x86/kernel/sys_x86_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a3f15ed545b5..6a78d4b36a79 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -19,7 +20,6 @@ #include #include -#include #include #include #include -- cgit From 60882cc159e1416fb1d17210de60d4a3ba04e613 Mon Sep 17 00:00:00 2001 From: David Wang Date: Fri, 20 Apr 2018 16:29:28 +0800 Subject: x86/Centaur: Initialize supported CPU features properly Centaur CPUs have some Intel compatible capabilities,including Permformance Monitoring Counters and CPU virtualization capabilities. Initialize them in the Centaur specific init code. Signed-off-by: David Wang Signed-off-by: Thomas Gleixner Cc: lukelin@viacpu.com Cc: qiyuanwang@zhaoxin.com Cc: gregkh@linuxfoundation.org Cc: brucechang@via-alliance.com Cc: timguo@zhaoxin.com Cc: cooperyan@zhaoxin.com Cc: hpa@zytor.com Cc: benjaminpan@viatech.com Link: https://lkml.kernel.org/r/1524212968-28998-1-git-send-email-davidwang@zhaoxin.com --- arch/x86/kernel/cpu/centaur.c | 48 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e5ec0f11c0de..80d5110481ec 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -18,6 +18,13 @@ #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + static void init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -112,6 +119,31 @@ static void early_init_centaur(struct cpuinfo_x86 *c) } } +static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + static void init_centaur(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -128,6 +160,19 @@ static void init_centaur(struct cpuinfo_x86 *c) clear_cpu_cap(c, 0*32+31); #endif early_init_centaur(c); + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + switch (c->x86) { #ifdef CONFIG_X86_32 case 5: @@ -199,6 +244,9 @@ static void init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif + + if (cpu_has(c, X86_FEATURE_VMX)) + centaur_detect_vmx_virtcap(c); } #ifdef CONFIG_X86_32 -- cgit From 3eb0f5193b497083391aa05d35210d5645211eef Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 17 Apr 2018 15:26:37 -0500 Subject: signal: Ensure every siginfo we send has all bits initialized Call clear_siginfo to ensure every stack allocated siginfo is properly initialized before being passed to the signal sending functions. Note: It is not safe to depend on C initializers to initialize struct siginfo on the stack because C is allowed to skip holes when initializing a structure. The initialization of struct siginfo in tracehook_report_syscall_exit was moved from the helper user_single_step_siginfo into tracehook_report_syscall_exit itself, to make it clear that the local variable siginfo gets fully initialized. In a few cases the scope of struct siginfo has been reduced to make it clear that siginfo siginfo is not used on other paths in the function in which it is declared. Instances of using memset to initialize siginfo have been replaced with calls clear_siginfo for clarity. Signed-off-by: "Eric W. Biederman" --- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/traps.c | 3 +++ arch/x86/kernel/umip.c | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index ed5c4cdf0a34..e2ee403865eb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1377,7 +1377,6 @@ static void fill_sigtrap_info(struct task_struct *tsk, tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; - memset(info, 0, sizeof(*info)); info->si_signo = SIGTRAP; info->si_code = si_code; info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; @@ -1395,6 +1394,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, { struct siginfo info; + clear_siginfo(&info); fill_sigtrap_info(tsk, regs, error_code, si_code, &info); /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 03f3d7695dac..a535dd64de63 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -299,6 +299,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); + clear_siginfo(&info); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -854,6 +855,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.trap_nr = trapnr; task->thread.error_code = error_code; + clear_siginfo(&info); info.si_signo = SIGFPE; info.si_errno = 0; info.si_addr = (void __user *)uprobe_get_trap_addr(regs); @@ -929,6 +931,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); + clear_siginfo(&info); info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_BADSTK; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f44ce0fb3583..ff20b35e98dd 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -278,6 +278,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; tsk->thread.trap_nr = X86_TRAP_PF; + clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_MAPERR; -- cgit From db78e6a0a6f9f7d7277965600eeb1a5b3a6f55a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 17 Apr 2018 16:18:25 -0500 Subject: signal: Add TRAP_UNK si_code for undiagnosted trap exceptions Both powerpc and alpha have cases where they wronly set si_code to 0 in combination with SIGTRAP and don't mean SI_USER. About half the time this is because the architecture can not report accurately what kind of trap exception triggered the trap exception. The other half the time it looks like no one has bothered to figure out an appropriate si_code. For the cases where the architecture does not have enough information or is too lazy to figure out exactly what kind of trap exception it is define TRAP_UNK. Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-alpha@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: "Eric W. Biederman" --- arch/x86/kernel/signal_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 14c057f29979..9ccbf0576cd0 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGFPE != 15); BUILD_BUG_ON(NSIGSEGV != 7); BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); BUILD_BUG_ON(NSIGSYS != 1); -- cgit From 5d12f0424edf01ccd8abbcba1c7d45fe0b23c779 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:16 +0200 Subject: x86/dumpstack: Remove code_bytes This was added by 86c418374223 ("[PATCH] i386: add option to show more code in oops reports") long time ago but experience shows that 64 instruction bytes are plenty when deciphering an oops. So get rid of it. Removing it will simplify further enhancements to the opcodes dumping machinery coming in the following patches. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-2-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 18fa9d74c182..593db796374d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,9 +22,10 @@ #include #include +#define OPCODE_BUFSIZE 64 + int panic_on_unrecovered_nmi; int panic_on_io_nmi; -static unsigned int code_bytes = 64; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -356,26 +357,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init code_bytes_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - - code_bytes = val; - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); - void show_regs(struct pt_regs *regs) { bool all = true; @@ -393,8 +374,8 @@ void show_regs(struct pt_regs *regs) * time of the fault.. */ if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; + unsigned int code_prologue = OPCODE_BUFSIZE * 43 / 64; + unsigned int code_len = OPCODE_BUFSIZE; unsigned char c; u8 *ip; -- cgit From 5df61707f0bdf8dce714a14806740e6abf2114c7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:17 +0200 Subject: x86/dumpstack: Unexport oops_begin() The only user outside of arch/ is not a module since 86cd47334b00 ("ACPI, APEI, GHES, Prevent GHES to be built as module") No functional changes. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-3-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 593db796374d..579455c2b91e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -268,7 +268,6 @@ unsigned long oops_begin(void) bust_spinlocks(1); return flags; } -EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); void __noreturn rewind_stack_do_exit(int signr); -- cgit From f0a1d7c11c3ebe2f601b448d13e7fbc3a0364a03 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:18 +0200 Subject: x86/dumpstack: Carve out code-dumping into a function No functionality change, carve it out into a separate function for later changes. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-4-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 57 ++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 579455c2b91e..eb9d6c00a52f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -70,6 +70,35 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +static void show_opcodes(u8 *rip) +{ + unsigned int code_prologue = OPCODE_BUFSIZE * 43 / 64; + unsigned int code_len = OPCODE_BUFSIZE; + unsigned char c; + u8 *ip; + int i; + + printk(KERN_DEFAULT "Code: "); + + ip = (u8 *)rip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)rip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + pr_cont(" Bad RIP value."); + break; + } + if (ip == (u8 *)rip) + pr_cont("<%02x> ", c); + else + pr_cont("%02x ", c); + } + pr_cont("\n"); +} + void show_iret_regs(struct pt_regs *regs) { printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); @@ -359,7 +388,6 @@ void die(const char *str, struct pt_regs *regs, long err) void show_regs(struct pt_regs *regs) { bool all = true; - int i; show_regs_print_info(KERN_DEFAULT); @@ -373,32 +401,7 @@ void show_regs(struct pt_regs *regs) * time of the fault.. */ if (!user_mode(regs)) { - unsigned int code_prologue = OPCODE_BUFSIZE * 43 / 64; - unsigned int code_len = OPCODE_BUFSIZE; - unsigned char c; - u8 *ip; - show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); - - printk(KERN_DEFAULT "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at IP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - pr_cont(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - pr_cont("<%02x> ", c); - else - pr_cont("%02x ", c); - } + show_opcodes((u8 *)regs->ip); } - pr_cont("\n"); } -- cgit From 9e4a90fd34445df64a13d136676a31a4dd22aea3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:19 +0200 Subject: x86/dumpstack: Improve opcodes dumping in the code section The code used to iterate byte-by-byte over the bytes around RIP and that is expensive: disabling pagefaults around it, copy_from_user, etc... Make it read the whole buffer of OPCODE_BUFSIZE size in one go. Use a statically allocated 64 bytes buffer so that concurrent show_opcodes() do not interleave in the output even though in the majority of the cases it's serialized via die_lock. Except the #PF path which doesn't... Also, do the PAGE_OFFSET check outside of the function because latter will be reused in other context. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-5-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index eb9d6c00a52f..1d6698b54527 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -72,29 +72,24 @@ static void printk_stack_address(unsigned long address, int reliable, static void show_opcodes(u8 *rip) { - unsigned int code_prologue = OPCODE_BUFSIZE * 43 / 64; - unsigned int code_len = OPCODE_BUFSIZE; - unsigned char c; + unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3; + u8 opcodes[OPCODE_BUFSIZE]; u8 *ip; int i; printk(KERN_DEFAULT "Code: "); ip = (u8 *)rip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at IP */ - ip = (u8 *)rip; - code_len = code_len - code_prologue + 1; + if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) { + pr_cont("Bad RIP value.\n"); + return; } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - pr_cont(" Bad RIP value."); - break; - } - if (ip == (u8 *)rip) - pr_cont("<%02x> ", c); + + for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) { + if (ip == rip) + pr_cont("<%02x> ", opcodes[i]); else - pr_cont("%02x ", c); + pr_cont("%02x ", opcodes[i]); } pr_cont("\n"); } @@ -402,6 +397,10 @@ void show_regs(struct pt_regs *regs) */ if (!user_mode(regs)) { show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); - show_opcodes((u8 *)regs->ip); + + if (regs->ip < PAGE_OFFSET) + printk(KERN_DEFAULT "Code: Bad RIP value.\n"); + else + show_opcodes((u8 *)regs->ip); } } -- cgit From e8b6f984516b1fcb0ccf4469ca42777c9c2dc76d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:20 +0200 Subject: x86/dumpstack: Add loglevel argument to show_opcodes() Will be used in the next patch. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-6-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1d6698b54527..1592d0c3ebb5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -70,14 +70,14 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } -static void show_opcodes(u8 *rip) +void show_opcodes(u8 *rip, const char *loglvl) { unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3; u8 opcodes[OPCODE_BUFSIZE]; u8 *ip; int i; - printk(KERN_DEFAULT "Code: "); + printk("%sCode: ", loglvl); ip = (u8 *)rip - code_prologue; if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) { @@ -401,6 +401,6 @@ void show_regs(struct pt_regs *regs) if (regs->ip < PAGE_OFFSET) printk(KERN_DEFAULT "Code: Bad RIP value.\n"); else - show_opcodes((u8 *)regs->ip); + show_opcodes((u8 *)regs->ip, KERN_DEFAULT); } } -- cgit From 7cccf0725cf7402514e09c52b089430005798b7f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:22 +0200 Subject: x86/dumpstack: Add a show_ip() function ... which shows the Instruction Pointer along with the insn bytes around it. Use it whenever rIP is printed. Drop the rIP < PAGE_OFFSET check since probe_kernel_read() can handle any address properly. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-8-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 23 +++++++++++++---------- arch/x86/kernel/process_32.c | 8 +++----- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1592d0c3ebb5..82da808b5c36 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,9 +94,19 @@ void show_opcodes(u8 *rip, const char *loglvl) pr_cont("\n"); } +void show_ip(struct pt_regs *regs, const char *loglvl) +{ +#ifdef CONFIG_X86_32 + printk("%sEIP: %pS\n", loglvl, (void *)regs->ip); +#else + printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); +#endif + show_opcodes((u8 *)regs->ip, loglvl); +} + void show_iret_regs(struct pt_regs *regs) { - printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + show_ip(regs, KERN_DEFAULT); printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, regs->sp, regs->flags); } @@ -392,15 +402,8 @@ void show_regs(struct pt_regs *regs) __show_regs(regs, all); /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. + * When in-kernel, we also print out the stack at the time of the fault.. */ - if (!user_mode(regs)) { + if (!user_mode(regs)) show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); - - if (regs->ip < PAGE_OFFSET) - printk(KERN_DEFAULT "Code: Bad RIP value.\n"); - else - show_opcodes((u8 *)regs->ip, KERN_DEFAULT); - } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5224c6099184..0ae659de21eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -76,16 +76,14 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); - printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, - raw_smp_processor_id()); + show_ip(regs, KERN_DEFAULT); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); if (!all) return; -- cgit From 602bd705da334f214fc03db328dc37d2f1f33307 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:23 +0200 Subject: x86/dumpstack: Save first regs set for the executive summary Save the regs set when __die() is onvoked for the first time and print it in oops_end(). Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-9-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 82da808b5c36..ee344030fd0a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -28,6 +28,8 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; static int die_counter; +static struct pt_regs exec_summary_regs; + bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info) { @@ -321,6 +323,9 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) raw_local_irq_restore(flags); oops_exit(); + /* Executive summary in case the oops scrolled away */ + __show_regs(&exec_summary_regs, true); + if (!signr) return; if (in_interrupt()) @@ -339,10 +344,10 @@ NOKPROBE_SYMBOL(oops_end); int __die(const char *str, struct pt_regs *regs, long err) { -#ifdef CONFIG_X86_32 - unsigned short ss; - unsigned long sp; -#endif + /* Save the regs of the first oops for the executive summary later. */ + if (!die_counter) + exec_summary_regs = *regs; + printk(KERN_DEFAULT "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", @@ -352,26 +357,13 @@ int __die(const char *str, struct pt_regs *regs, long err) IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); + show_regs(regs); + print_modules(); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; - print_modules(); - show_regs(regs); -#ifdef CONFIG_X86_32 - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss; - } else { - sp = kernel_stack_pointer(regs); - savesegment(ss, ss); - } - printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", - (void *)regs->ip, ss, sp); -#else - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); -#endif return 0; } NOKPROBE_SYMBOL(__die); -- cgit From 4dba072cd097f35fa8f77c49d909ada2b079a4c4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 Apr 2018 18:11:24 +0200 Subject: x86/dumpstack: Explain the reasoning for the prologue and buffer size The whole reasoning behind the amount of opcode bytes dumped and prologue length isn't very clear so write down some of the reasons for why it is done the way it is. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180417161124.5294-10-bp@alien8.de --- arch/x86/kernel/dumpstack.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ee344030fd0a..666a284116ac 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -72,6 +72,25 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +/* + * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: + * + * In case where we don't have the exact kernel image (which, if we did, we can + * simply disassemble and navigate to the RIP), the purpose of the bigger + * prologue is to have more context and to be able to correlate the code from + * the different toolchains better. + * + * In addition, it helps in recreating the register allocation of the failing + * kernel and thus make sense of the register dump. + * + * What is more, the additional complication of a variable length insn arch like + * x86 warrants having longer byte sequence before rIP so that the disassembler + * can "sync" up properly and find instruction boundaries when decoding the + * opcode bytes. + * + * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random + * guesstimate in attempt to achieve all of the above. + */ void show_opcodes(u8 *rip, const char *loglvl) { unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3; -- cgit From 13e8582245267b872dc6eb4ab695fffc797d99f5 Mon Sep 17 00:00:00 2001 From: David Wang Date: Wed, 25 Apr 2018 18:33:39 +0800 Subject: x86/MCE: Enable MCE broadcasting on new Centaur CPUs Newer Centaur multi-core CPUs also support MCE broadcasting to all cores. Add a Centaur-specific init function setting that up. [ bp: - make mce_centaur_feature_init() static - flip check to do the f/m/s first for better readability - touch up text ] Signed-off-by: David Wang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: lukelin@viacpu.com Cc: qiyuanwang@zhaoxin.com Cc: Greg KH Cc: brucechang@via-alliance.com Cc: timguo@zhaoxin.com Cc: cooperyan@zhaoxin.com Cc: Tony Luck Cc: benjaminpan@viatech.com Cc: linux-edac Link: http://lkml.kernel.org/r/1524652420-17330-2-git-send-email-davidwang@zhaoxin.com --- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 42cf2880d0ed..cd76380af79f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1727,6 +1727,21 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) } } +static void mce_centaur_feature_init(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + /* + * All newer Centaur CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || + c->x86 > 6) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -1739,6 +1754,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_amd_feature_init(c); break; } + case X86_VENDOR_CENTAUR: + mce_centaur_feature_init(c); + break; default: break; -- cgit From 985c78d3ff8e9c74450fa2bb08eb55e680d999ca Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Fri, 27 Apr 2018 09:37:08 -0700 Subject: x86/MCE: Fix stack out-of-bounds write in mce-inject.c: Flags_read() Each of the strings that we want to put into the buf[MAX_FLAG_OPT_SIZE] in flags_read() is two characters long. But the sprintf() adds a trailing newline and will add a terminating NUL byte. So MAX_FLAG_OPT_SIZE needs to be 4. sprintf() calls vsnprintf() and *that* does return: " * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99." Note the "excluding". Reported-by: Dmitry Vyukov Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: linux-edac Link: http://lkml.kernel.org/r/20180427163707.ktaiysvbk3yhk4wm@agluck-desk --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 475cb4f5f14f..c805a06e14c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -48,7 +48,7 @@ static struct dentry *dfs_inj; static u8 n_banks; -#define MAX_FLAG_OPT_SIZE 3 +#define MAX_FLAG_OPT_SIZE 4 #define NBCFG 0x44 enum injection_type { -- cgit From f8b64d08dde2714c62751d18ba77f4aeceb161d3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Apr 2018 16:34:34 -0500 Subject: x86/CPU/AMD: Have smp_num_siblings and cpu_llc_id always be present Move smp_num_siblings and cpu_llc_id to cpu/common.c so that they're always present as symbols and not only in the CONFIG_SMP case. Then, other code using them doesn't need ugly ifdeffery anymore. Get rid of some ifdeffery. Signed-off-by: Borislav Petkov Signed-off-by: Suravee Suthikulpanit Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1524864877-111962-2-git-send-email-suravee.suthikulpanit@amd.com --- arch/x86/kernel/cpu/amd.c | 10 +--------- arch/x86/kernel/cpu/common.c | 7 +++++++ arch/x86/kernel/smpboot.c | 7 ------- 3 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12bc0a1139da..a37a83809665 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -297,7 +297,6 @@ static int nearby_node(int apicid) } #endif -#ifdef CONFIG_SMP /* * Fix up cpu_core_id for pre-F17h systems to be in the * [0 .. cores_per_node - 1] range. Not really needed but @@ -375,7 +374,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) legacy_fixup_core_id(c); } } -#endif /* * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. @@ -383,7 +381,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -395,16 +392,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); -#endif } u16 amd_get_nb_id(int cpu) { - u16 id = 0; -#ifdef CONFIG_SMP - id = per_cpu(cpu_llc_id, cpu); -#endif - return id; + return per_cpu(cpu_llc_id, cpu); } EXPORT_SYMBOL_GPL(amd_get_nb_id); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8a5b185735e1..37c7c8334a00 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ff99e2b6fc54..91d48f3eeda8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -78,13 +78,6 @@ #include #include -/* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); - -/* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; - /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); -- cgit From 1d200c078d0e3e49e2995b9d25fef8926d491f4f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Apr 2018 16:34:36 -0500 Subject: x86/CPU: Rename intel_cacheinfo.c to cacheinfo.c Since this file contains general cache-related information for x86, rename the file to a more generic name. Signed-off-by: Borislav Petkov Signed-off-by: Suravee Suthikulpanit Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1524864877-111962-4-git-send-email-suravee.suthikulpanit@amd.com --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/cacheinfo.c | 968 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_cacheinfo.c | 968 ---------------------------------- 3 files changed, 969 insertions(+), 969 deletions(-) create mode 100644 arch/x86/kernel/cpu/cacheinfo.c delete mode 100644 arch/x86/kernel/cpu/intel_cacheinfo.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a66229f51b12..7a40196967cb 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o scattered.o topology.o +obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c new file mode 100644 index 000000000000..54d04d574148 --- /dev/null +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -0,0 +1,968 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Routines to identify caches on Intel CPU. + * + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Ashok Raj : Work with CPU hotplug infrastructure. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define LVL_1_INST 1 +#define LVL_1_DATA 2 +#define LVL_2 3 +#define LVL_3 4 +#define LVL_TRACE 5 + +struct _cache_table { + unsigned char descriptor; + char cache_type; + short size; +}; + +#define MB(x) ((x) * 1024) + +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + +static const struct _cache_table cache_table[] = +{ + { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ + { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ + { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ + { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ + { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ + { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ + { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ + { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ + { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ + { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ + { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ + { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ + { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ + { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ + { 0x00, 0, 0} +}; + + +enum _cache_type { + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 +}; + +union _cpuid4_leaf_eax { + struct { + enum _cache_type type:5; + unsigned int level:3; + unsigned int is_self_initializing:1; + unsigned int is_fully_associative:1; + unsigned int reserved:4; + unsigned int num_threads_sharing:12; + unsigned int num_cores_on_die:6; + } split; + u32 full; +}; + +union _cpuid4_leaf_ebx { + struct { + unsigned int coherency_line_size:12; + unsigned int physical_line_partition:10; + unsigned int ways_of_associativity:10; + } split; + u32 full; +}; + +union _cpuid4_leaf_ecx { + struct { + unsigned int number_of_sets:32; + } split; + u32 full; +}; + +struct _cpuid4_info_regs { + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned int id; + unsigned long size; + struct amd_northbridge *nb; +}; + +static unsigned short num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same + information to the user. This makes some assumptions about the machine: + L2 not shared, no SMT etc. that is currently true on AMD CPUs. + + In theory the TLBs could be reported as fake type (they are in "dummy"). + Maybe later */ +union l1_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; + }; + unsigned val; +}; + +union l2_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; + }; + unsigned val; +}; + +union l3_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; + }; + unsigned val; +}; + +static const unsigned short assocs[] = { + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; + +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +static void +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) +{ + unsigned dummy; + unsigned line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d; + union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; + + eax->full = 0; + ebx->full = 0; + ecx->full = 0; + + cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + case 0: + if (!l1->val) + return; + assoc = assocs[l1->assoc]; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; + break; + case 2: + if (!l2.val) + return; + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + /* cpu_data has errata corrections for K7 applied */ + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + break; + case 3: + if (!l3.val) + return; + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } + break; + default: + return; + } + + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + + + if (assoc == 0xffff) + eax->split.is_fully_associative = 1; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; +} + +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, + unsigned int slot) +{ + int index; + struct amd_northbridge *nb = this_leaf->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); + + return sprintf(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ + return show_cache_disable(this_leaf, buf, slot); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + struct amd_northbridge *nb = this_leaf->priv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&this_leaf->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ + return store_cache_disable(this_leaf, buf, count, slot); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t +cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!this_leaf->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + int n = 1; + static struct attribute **amd_l3_attrs; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group * +cache_get_priv_group(struct cacheinfo *this_leaf) +{ + struct amd_northbridge *nb = this_leaf->priv; + + if (this_leaf->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return; + + node = amd_get_nb_id(smp_processor_id()); + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); +} +#else +#define amd_init_l3_cache(x, y) +#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ + +static int +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +{ + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned edx; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); + amd_init_l3_cache(this_leaf, index); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + + if (eax.split.type == CTYPE_NULL) + return -EIO; /* better error ? */ + + this_leaf->eax = eax; + this_leaf->ebx = ebx; + this_leaf->ecx = ecx; + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); + return 0; +} + +static int find_num_cache_leaves(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + + if (c->x86_vendor == X86_VENDOR_AMD) + op = 0x8000001d; + else + op = 4; + + do { + ++i; + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); + cache_eax.full = eax; + } while (cache_eax.split.type != CTYPE_NULL); + return i; +} + +void init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + +unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; + unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ + unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; +#ifdef CONFIG_SMP + unsigned int cpu = c->cpu_index; +#endif + + if (c->cpuid_level > 3) { + static int is_initialized; + + if (is_initialized == 0) { + /* Init num_cache_leaves from boot CPU */ + num_cache_leaves = find_num_cache_leaves(c); + is_initialized++; + } + + /* + * Whenever possible use cpuid(4), deterministic cache + * parameters cpuid leaf to find the cache details + */ + for (i = 0; i < num_cache_leaves; i++) { + struct _cpuid4_info_regs this_leaf = {}; + int retval; + + retval = cpuid4_cache_lookup_regs(i, &this_leaf); + if (retval < 0) + continue; + + switch (this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == CTYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == CTYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid & ~((1 << index_msb) - 1); + break; + case 3: + new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid & ~((1 << index_msb) - 1); + break; + default: + break; + } + } + } + /* + * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for + * trace cache + */ + if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { + /* supports eax=2 call */ + int j, n; + unsigned int regs[4]; + unsigned char *dp = (unsigned char *)regs; + int only_trace = 0; + + if (num_cache_leaves != 0 && c->x86 == 15) + only_trace = 1; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) { + unsigned char des = dp[j]; + unsigned char k = 0; + + /* look up this descriptor in the table */ + while (cache_table[k].descriptor != 0) { + if (cache_table[k].descriptor == des) { + if (only_trace && cache_table[k].cache_type != LVL_TRACE) + break; + switch (cache_table[k].cache_type) { + case LVL_1_INST: + l1i += cache_table[k].size; + break; + case LVL_1_DATA: + l1d += cache_table[k].size; + break; + case LVL_2: + l2 += cache_table[k].size; + break; + case LVL_3: + l3 += cache_table[k].size; + break; + case LVL_TRACE: + trace += cache_table[k].size; + break; + } + + break; + } + + k++; + } + } + } + } + + if (new_l1d) + l1d = new_l1d; + + if (new_l1i) + l1i = new_l1i; + + if (new_l2) { + l2 = new_l2; +#ifdef CONFIG_SMP + per_cpu(cpu_llc_id, cpu) = l2_id; +#endif + } + + if (new_l3) { + l3 = new_l3; +#ifdef CONFIG_SMP + per_cpu(cpu_llc_id, cpu) = l3_id; +#endif + } + +#ifdef CONFIG_SMP + /* + * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * turns means that the only possibility is SMT (as indicated in + * cpuid1). Since cpuid2 doesn't specify shared caches, and we know + * that SMT shares all caches, we can unconditionally set cpu_llc_id to + * c->phys_proc_id. + */ + if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif + + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); + + return l2; +} + +static int __cache_amd_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf; + int i, sibling; + + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + unsigned int apicid, nshared, first, last; + + nshared = base->eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; + + this_leaf = this_cpu_ci->info_list + index; + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else + return 0; + + return 1; +} + +static void __cache_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (__cache_amd_cpumap_setup(cpu, index, base)) + return; + } + + this_leaf = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + if (num_threads_sharing == 1) + return; + + index_msb = get_count_order(num_threads_sharing); + + for_each_online_cpu(i) + if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + + if (i == cpu || !sib_cpu_ci->info_list) + continue;/* skip if itself or no cacheinfo */ + sibling_leaf = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + } +} + +static void ci_leaf_init(struct cacheinfo *this_leaf, + struct _cpuid4_info_regs *base) +{ + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; + this_leaf->level = base->eax.split.level; + this_leaf->type = cache_type_map[base->eax.split.type]; + this_leaf->coherency_line_size = + base->ebx.split.coherency_line_size + 1; + this_leaf->ways_of_associativity = + base->ebx.split.ways_of_associativity + 1; + this_leaf->size = base->size; + this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; + this_leaf->physical_line_partition = + base->ebx.split.physical_line_partition + 1; + this_leaf->priv = base->nb; +} + +static int __init_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + if (!num_cache_leaves) + return -ENOENT; + if (!this_cpu_ci) + return -EINVAL; + this_cpu_ci->num_levels = 3; + this_cpu_ci->num_leaves = num_cache_leaves; + return 0; +} + +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + +static int __populate_cache_leaves(unsigned int cpu) +{ + unsigned int idx, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct _cpuid4_info_regs id4_regs = {}; + + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + if (ret) + return ret; + get_cache_id(cpu, &id4_regs); + ci_leaf_init(this_leaf++, &id4_regs); + __cache_cpumap_setup(cpu, idx, &id4_regs); + } + this_cpu_ci->cpu_map_populated = true; + + return 0; +} + +DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) +DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c deleted file mode 100644 index 54d04d574148..000000000000 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ /dev/null @@ -1,968 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Routines to identify caches on Intel CPU. - * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj : Work with CPU hotplug infrastructure. - * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define LVL_1_INST 1 -#define LVL_1_DATA 2 -#define LVL_2 3 -#define LVL_3 4 -#define LVL_TRACE 5 - -struct _cache_table { - unsigned char descriptor; - char cache_type; - short size; -}; - -#define MB(x) ((x) * 1024) - -/* All the cache descriptor types we care about (no TLB or - trace cache entries) */ - -static const struct _cache_table cache_table[] = -{ - { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ - { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ - { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ - { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ - { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ - { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ - { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ - { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ - { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ - { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ - { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ - { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ - { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ - { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ - { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ - { 0x00, 0, 0} -}; - - -enum _cache_type { - CTYPE_NULL = 0, - CTYPE_DATA = 1, - CTYPE_INST = 2, - CTYPE_UNIFIED = 3 -}; - -union _cpuid4_leaf_eax { - struct { - enum _cache_type type:5; - unsigned int level:3; - unsigned int is_self_initializing:1; - unsigned int is_fully_associative:1; - unsigned int reserved:4; - unsigned int num_threads_sharing:12; - unsigned int num_cores_on_die:6; - } split; - u32 full; -}; - -union _cpuid4_leaf_ebx { - struct { - unsigned int coherency_line_size:12; - unsigned int physical_line_partition:10; - unsigned int ways_of_associativity:10; - } split; - u32 full; -}; - -union _cpuid4_leaf_ecx { - struct { - unsigned int number_of_sets:32; - } split; - u32 full; -}; - -struct _cpuid4_info_regs { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned int id; - unsigned long size; - struct amd_northbridge *nb; -}; - -static unsigned short num_cache_leaves; - -/* AMD doesn't have CPUID4. Emulate it here to report the same - information to the user. This makes some assumptions about the machine: - L2 not shared, no SMT etc. that is currently true on AMD CPUs. - - In theory the TLBs could be reported as fake type (they are in "dummy"). - Maybe later */ -union l1_cache { - struct { - unsigned line_size:8; - unsigned lines_per_tag:8; - unsigned assoc:8; - unsigned size_in_kb:8; - }; - unsigned val; -}; - -union l2_cache { - struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned size_in_kb:16; - }; - unsigned val; -}; - -union l3_cache { - struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned res:2; - unsigned size_encoded:14; - }; - unsigned val; -}; - -static const unsigned short assocs[] = { - [1] = 1, - [2] = 2, - [4] = 4, - [6] = 8, - [8] = 16, - [0xa] = 32, - [0xb] = 48, - [0xc] = 64, - [0xd] = 96, - [0xe] = 128, - [0xf] = 0xffff /* fully associative - no way to show this currently */ -}; - -static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; - -static const enum cache_type cache_type_map[] = { - [CTYPE_NULL] = CACHE_TYPE_NOCACHE, - [CTYPE_DATA] = CACHE_TYPE_DATA, - [CTYPE_INST] = CACHE_TYPE_INST, - [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, -}; - -static void -amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) -{ - unsigned dummy; - unsigned line_size, lines_per_tag, assoc, size_in_kb; - union l1_cache l1i, l1d; - union l2_cache l2; - union l3_cache l3; - union l1_cache *l1 = &l1d; - - eax->full = 0; - ebx->full = 0; - ecx->full = 0; - - cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); - cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); - - switch (leaf) { - case 1: - l1 = &l1i; - case 0: - if (!l1->val) - return; - assoc = assocs[l1->assoc]; - line_size = l1->line_size; - lines_per_tag = l1->lines_per_tag; - size_in_kb = l1->size_in_kb; - break; - case 2: - if (!l2.val) - return; - assoc = assocs[l2.assoc]; - line_size = l2.line_size; - lines_per_tag = l2.lines_per_tag; - /* cpu_data has errata corrections for K7 applied */ - size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); - break; - case 3: - if (!l3.val) - return; - assoc = assocs[l3.assoc]; - line_size = l3.line_size; - lines_per_tag = l3.lines_per_tag; - size_in_kb = l3.size_encoded * 512; - if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { - size_in_kb = size_in_kb >> 1; - assoc = assoc >> 1; - } - break; - default: - return; - } - - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; - - - if (assoc == 0xffff) - eax->split.is_fully_associative = 1; - ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assoc - 1; - ebx->split.physical_line_partition = lines_per_tag - 1; - ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / - (ebx->split.ways_of_associativity + 1) - 1; -} - -#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) - -/* - * L3 cache descriptors - */ -static void amd_calc_l3_indices(struct amd_northbridge *nb) -{ - struct amd_l3_cache *l3 = &nb->l3_cache; - unsigned int sc0, sc1, sc2, sc3; - u32 val = 0; - - pci_read_config_dword(nb->misc, 0x1C4, &val); - - /* calculate subcache sizes */ - l3->subcaches[0] = sc0 = !(val & BIT(0)); - l3->subcaches[1] = sc1 = !(val & BIT(4)); - - if (boot_cpu_data.x86 == 0x15) { - l3->subcaches[0] = sc0 += !(val & BIT(1)); - l3->subcaches[1] = sc1 += !(val & BIT(5)); - } - - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -/* - * check whether a slot used for disabling an L3 index is occupied. - * @l3: L3 cache descriptor - * @slot: slot number (0..1) - * - * @returns: the disabled index if used or negative value if slot free. - */ -static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) -{ - unsigned int reg = 0; - - pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); - - /* check whether this slot is activated already */ - if (reg & (3UL << 30)) - return reg & 0xfff; - - return -1; -} - -static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, - unsigned int slot) -{ - int index; - struct amd_northbridge *nb = this_leaf->priv; - - index = amd_get_l3_disable_slot(nb, slot); - if (index >= 0) - return sprintf(buf, "%d\n", index); - - return sprintf(buf, "FREE\n"); -} - -#define SHOW_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return show_cache_disable(this_leaf, buf, slot); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long idx) -{ - int i; - - idx |= BIT(30); - - /* - * disable index in all 4 subcaches - */ - for (i = 0; i < 4; i++) { - u32 reg = idx | (i << 20); - - if (!nb->l3_cache.subcaches[i]) - continue; - - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - - /* - * We need to WBINVD on a core on the node containing the L3 - * cache which indices we disable therefore a simple wbinvd() - * is not sufficient. - */ - wbinvd_on_cpu(cpu); - - reg |= BIT(31); - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - } -} - -/* - * disable a L3 cache index by using a disable-slot - * - * @l3: L3 cache descriptor - * @cpu: A CPU on the node containing the L3 cache - * @slot: slot number (0..1) - * @index: index to disable - * - * @return: 0 on success, error status on failure - */ -static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long index) -{ - int ret = 0; - - /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(nb, slot); - if (ret >= 0) - return -EEXIST; - - if (index > nb->l3_cache.indices) - return -EINVAL; - - /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EEXIST; - - amd_l3_disable_index(nb, cpu, slot, index); - - return 0; -} - -static ssize_t store_cache_disable(struct cacheinfo *this_leaf, - const char *buf, size_t count, - unsigned int slot) -{ - unsigned long val = 0; - int cpu, err = 0; - struct amd_northbridge *nb = this_leaf->priv; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - cpu = cpumask_first(&this_leaf->shared_cpu_map); - - if (kstrtoul(buf, 10, &val) < 0) - return -EINVAL; - - err = amd_set_l3_disable_slot(nb, cpu, slot, val); - if (err) { - if (err == -EEXIST) - pr_warn("L3 slot %d in use/index already disabled!\n", - slot); - return err; - } - return count; -} - -#define STORE_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_store(struct device *dev, \ - struct device_attribute *attr, \ - const char *buf, size_t count) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return store_cache_disable(this_leaf, buf, count, slot); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -static ssize_t subcaches_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - - return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); -} - -static ssize_t subcaches_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - unsigned long val; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (kstrtoul(buf, 16, &val) < 0) - return -EINVAL; - - if (amd_set_subcaches(cpu, val)) - return -EINVAL; - - return count; -} - -static DEVICE_ATTR_RW(cache_disable_0); -static DEVICE_ATTR_RW(cache_disable_1); -static DEVICE_ATTR_RW(subcaches); - -static umode_t -cache_private_attrs_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) -{ - struct device *dev = kobj_to_dev(kobj); - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - umode_t mode = attr->mode; - - if (!this_leaf->priv) - return 0; - - if ((attr == &dev_attr_subcaches.attr) && - amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return mode; - - if ((attr == &dev_attr_cache_disable_0.attr || - attr == &dev_attr_cache_disable_1.attr) && - amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return mode; - - return 0; -} - -static struct attribute_group cache_private_group = { - .is_visible = cache_private_attrs_is_visible, -}; - -static void init_amd_l3_attrs(void) -{ - int n = 1; - static struct attribute **amd_l3_attrs; - - if (amd_l3_attrs) /* already initialized */ - return; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); - if (!amd_l3_attrs) - return; - - n = 0; - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; - amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; - } - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - amd_l3_attrs[n++] = &dev_attr_subcaches.attr; - - cache_private_group.attrs = amd_l3_attrs; -} - -const struct attribute_group * -cache_get_priv_group(struct cacheinfo *this_leaf) -{ - struct amd_northbridge *nb = this_leaf->priv; - - if (this_leaf->level < 3 || !nb) - return NULL; - - if (nb && nb->l3_cache.indices) - init_amd_l3_attrs(); - - return &cache_private_group; -} - -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) -{ - int node; - - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; - - node = amd_get_nb_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); -} -#else -#define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ - -static int -cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) -{ - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned edx; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } - - if (eax.split.type == CTYPE_NULL) - return -EIO; /* better error ? */ - - this_leaf->eax = eax; - this_leaf->ebx = ebx; - this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); - return 0; -} - -static int find_num_cache_leaves(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx, op; - union _cpuid4_leaf_eax cache_eax; - int i = -1; - - if (c->x86_vendor == X86_VENDOR_AMD) - op = 0x8000001d; - else - op = 4; - - do { - ++i; - /* Do cpuid(op) loop to find out num_cache_leaves */ - cpuid_count(op, i, &eax, &ebx, &ecx, &edx); - cache_eax.full = eax; - } while (cache_eax.split.type != CTYPE_NULL); - return i; -} - -void init_amd_cacheinfo(struct cpuinfo_x86 *c) -{ - - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - num_cache_leaves = find_num_cache_leaves(c); - } else if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } -} - -unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; - unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ - unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP - unsigned int cpu = c->cpu_index; -#endif - - if (c->cpuid_level > 3) { - static int is_initialized; - - if (is_initialized == 0) { - /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(c); - is_initialized++; - } - - /* - * Whenever possible use cpuid(4), deterministic cache - * parameters cpuid leaf to find the cache details - */ - for (i = 0; i < num_cache_leaves; i++) { - struct _cpuid4_info_regs this_leaf = {}; - int retval; - - retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval < 0) - continue; - - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == CTYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CTYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l3_id = c->apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } - } - } - /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache - */ - if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { - /* supports eax=2 call */ - int j, n; - unsigned int regs[4]; - unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (num_cache_leaves != 0 && c->x86 == 15) - only_trace = 1; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) { - unsigned char des = dp[j]; - unsigned char k = 0; - - /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) { - if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; - switch (cache_table[k].cache_type) { - case LVL_1_INST: - l1i += cache_table[k].size; - break; - case LVL_1_DATA: - l1d += cache_table[k].size; - break; - case LVL_2: - l2 += cache_table[k].size; - break; - case LVL_3: - l3 += cache_table[k].size; - break; - case LVL_TRACE: - trace += cache_table[k].size; - break; - } - - break; - } - - k++; - } - } - } - } - - if (new_l1d) - l1d = new_l1d; - - if (new_l1i) - l1i = new_l1i; - - if (new_l2) { - l2 = new_l2; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l2_id; -#endif - } - - if (new_l3) { - l3 = new_l3; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l3_id; -#endif - } - -#ifdef CONFIG_SMP - /* - * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in - * turns means that the only possibility is SMT (as indicated in - * cpuid1). Since cpuid2 doesn't specify shared caches, and we know - * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->phys_proc_id. - */ - if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; -#endif - - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); - - return l2; -} - -static int __cache_amd_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) -{ - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf; - int i, sibling; - - /* - * For L3, always use the pre-calculated cpu_llc_shared_mask - * to derive shared_cpu_map. - */ - if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } - } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - unsigned int apicid, nshared, first, last; - - nshared = base->eax.split.num_threads_sharing + 1; - apicid = cpu_data(cpu).apicid; - first = apicid - (apicid % nshared); - last = first + nshared - 1; - - for_each_online_cpu(i) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - - apicid = cpu_data(i).apicid; - if ((apicid < first) || (apicid > last)) - continue; - - this_leaf = this_cpu_ci->info_list + index; - - for_each_online_cpu(sibling) { - apicid = cpu_data(sibling).apicid; - if ((apicid < first) || (apicid > last)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } - } else - return 0; - - return 1; -} - -static void __cache_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) -{ - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sibling_leaf; - unsigned long num_threads_sharing; - int index_msb, i; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_vendor == X86_VENDOR_AMD) { - if (__cache_amd_cpumap_setup(cpu, index, base)) - return; - } - - this_leaf = this_cpu_ci->info_list + index; - num_threads_sharing = 1 + base->eax.split.num_threads_sharing; - - cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); - if (num_threads_sharing == 1) - return; - - index_msb = get_count_order(num_threads_sharing); - - for_each_online_cpu(i) - if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { - struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); - - if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ - sibling_leaf = sib_cpu_ci->info_list + index; - cpumask_set_cpu(i, &this_leaf->shared_cpu_map); - cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); - } -} - -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct _cpuid4_info_regs *base) -{ - this_leaf->id = base->id; - this_leaf->attributes = CACHE_ID; - this_leaf->level = base->eax.split.level; - this_leaf->type = cache_type_map[base->eax.split.type]; - this_leaf->coherency_line_size = - base->ebx.split.coherency_line_size + 1; - this_leaf->ways_of_associativity = - base->ebx.split.ways_of_associativity + 1; - this_leaf->size = base->size; - this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; - this_leaf->physical_line_partition = - base->ebx.split.physical_line_partition + 1; - this_leaf->priv = base->nb; -} - -static int __init_cache_level(unsigned int cpu) -{ - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - - if (!num_cache_leaves) - return -ENOENT; - if (!this_cpu_ci) - return -EINVAL; - this_cpu_ci->num_levels = 3; - this_cpu_ci->num_leaves = num_cache_leaves; - return 0; -} - -/* - * The max shared threads number comes from CPUID.4:EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. - */ -static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->apicid >> index_msb; -} - -static int __populate_cache_leaves(unsigned int cpu) -{ - unsigned int idx, ret; - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct _cpuid4_info_regs id4_regs = {}; - - for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { - ret = cpuid4_cache_lookup_regs(idx, &id4_regs); - if (ret) - return ret; - get_cache_id(cpu, &id4_regs); - ci_leaf_init(this_leaf++, &id4_regs); - __cache_cpumap_setup(cpu, idx, &id4_regs); - } - this_cpu_ci->cpu_map_populated = true; - - return 0; -} - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) -- cgit From 68091ee7ac3c1a8786fe1bebbd616b14236efb99 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Fri, 27 Apr 2018 16:34:37 -0500 Subject: x86/CPU/AMD: Calculate last level cache ID from number of sharing threads Last Level Cache ID can be calculated from the number of threads sharing the cache, which is available from CPUID Fn0x8000001D (Cache Properties). This is used to left-shift the APIC ID to derive LLC ID. Therefore, default to this method unless the APIC ID enumeration does not follow the scheme. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1524864877-111962-5-git-send-email-suravee.suthikulpanit@amd.com --- arch/x86/kernel/cpu/amd.c | 19 +++---------------- arch/x86/kernel/cpu/cacheinfo.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a37a83809665..bf27246bb7bd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -343,22 +344,8 @@ static void amd_get_topology(struct cpuinfo_x86 *c) c->x86_max_cores /= smp_num_siblings; } - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (cpuid_edx(0x80000006)) { - if (c->x86 == 0x17) { - /* - * LLC is at the core complex level. - * Core complex id is ApicId[3]. - */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; - } else { - /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = node_id; - } - } + cacheinfo_amd_init_llc_id(c, cpu, node_id); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 54d04d574148..a2e03c9401a1 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -637,6 +637,45 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + if (c->x86 < 0x17) { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = node_id; + } else if (c->x86 == 0x17 && + c->x86_model >= 0 && c->x86_model <= 0x1F) { + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* + * LLC ID is calculated from the number of threads sharing the + * cache. + * */ + u32 eax, ebx, ecx, edx, num_sharing_cache = 0; + u32 llc_index = find_num_cache_leaves(c) - 1; + + cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); + if (eax) + num_sharing_cache = ((eax >> 14) & 0xfff) + 1; + + if (num_sharing_cache) { + int bits = get_count_order(num_sharing_cache) - 1; + + per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + } + } +} + void init_amd_cacheinfo(struct cpuinfo_x86 *c) { -- cgit From 6c4f5abaf3566dbf5b26e7b14f4392be400f12e3 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Fri, 27 Apr 2018 16:48:00 -0500 Subject: x86/CPU: Modify detect_extended_topology() to return result Current implementation does not communicate whether it can successfully detect CPUID function 0xB information. Therefore, modify the function to return success or error codes. This will be used by subsequent patches. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Link: http://lkml.kernel.org/r/1524865681-112110-2-git-send-email-suravee.suthikulpanit@amd.com --- arch/x86/kernel/cpu/topology.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index b099024d339c..81c0afb39d0a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -27,7 +27,7 @@ * exists, use it for populating initial_apicid and cpu topology * detection. */ -void detect_extended_topology(struct cpuinfo_x86 *c) +int detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; @@ -36,7 +36,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c) static bool printed; if (c->cpuid_level < 0xb) - return; + return -1; cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); @@ -44,7 +44,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c) * check if the cpuid leaf 0xb is actually implemented. */ if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return; + return -1; set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); @@ -95,6 +95,6 @@ void detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id); printed = 1; } - return; #endif + return 0; } -- cgit From 3986a0a805e668a63fac0ca2cdfa8db951f87c4b Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Fri, 27 Apr 2018 16:48:01 -0500 Subject: x86/CPU/AMD: Derive CPU topology from CPUID function 0xB when available Derive topology information from Extended Topology Enumeration (CPUID function 0xB) when the information is available. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1524865681-112110-3-git-send-email-suravee.suthikulpanit@amd.com --- arch/x86/kernel/cpu/amd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bf27246bb7bd..55361ee04cc5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -327,6 +327,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -344,6 +345,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) c->x86_max_cores /= smp_num_siblings; } + /* + * In case leaf B is available, use it to derive + * topology information. + */ + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + cacheinfo_amd_init_llc_id(c, cpu, node_id); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { @@ -378,7 +387,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - amd_get_topology(c); } u16 amd_get_nb_id(int cpu) @@ -821,6 +829,7 @@ static void init_amd(struct cpuinfo_x86 *c) /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) { amd_detect_cmp(c); + amd_get_topology(c); srat_detect_node(c); } -- cgit From 15b28bbcd567a9199481ecfef39702b258f9baff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Apr 2018 17:22:28 +0200 Subject: dma-debug: move initialization to common code Most mainstream architectures are using 65536 entries, so lets stick to that. If someone is really desperate to override it that can still be done through , but I'd rather see a really good rationale for that. dma_debug_init is now called as a core_initcall, which for many architectures means much earlier, and provides dma-debug functionality earlier in the boot process. This should be safe as it only relies on the memory allocator already being available. Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Reviewed-by: Robin Murphy --- arch/x86/kernel/pci-dma.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 77625b60a510..bcbaa2e8031e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -55,9 +55,6 @@ struct device x86_dma_fallback_dev = { }; EXPORT_SYMBOL(x86_dma_fallback_dev); -/* Number of entries preallocated for DMA-API debugging */ -#define PREALLOC_DMA_DEBUG_ENTRIES 65536 - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -189,7 +186,6 @@ EXPORT_SYMBOL(arch_dma_supported); static int __init pci_iommu_init(void) { struct iommu_table_entry *p; - dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); -- cgit From 27cca866e3fce0961e13b38b87e5322fe153e437 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Fri, 13 Apr 2018 23:55:07 +1000 Subject: mm/pkeys, x86, powerpc: Display pkey in smaps if arch supports pkeys Currently the architecture specific code is expected to display the protection keys in smap for a given vma. This can lead to redundant code and possibly to divergent formats in which the key gets displayed. This patch changes the implementation. It displays the pkey only if the architecture support pkeys, i.e arch_pkeys_enabled() returns true. x86 arch_show_smap() function is not needed anymore, delete it. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Ram Pai [mpe: Split out from larger patch, rebased on header changes] Signed-off-by: Michael Ellerman Reviewed-by: Dave Hansen --- arch/x86/kernel/setup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5c623dfe39d1..2f86d883dd95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1312,11 +1312,3 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); - -void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) -{ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return; - - seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); -} -- cgit From b5cf8707e6c9d85819b4bee3218ec560953149f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 May 2018 11:29:07 +0200 Subject: x86/CPU: Move cpu local function declarations to local header No point in exposing all these functions globaly as they are strict local to the cpu management code. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/cacheinfo.c | 2 ++ arch/x86/kernel/cpu/cpu.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index a2e03c9401a1..58d472c84ba2 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -20,6 +20,8 @@ #include #include +#include "cpu.h" + #define LVL_1_INST 1 #define LVL_1_DATA 2 #define LVL_2 3 diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e806b11a99af..c415f99e9599 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,6 +47,15 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, + unsigned int sub_leaf, + enum cpuid_regs_idx reg); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); + +extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); -- cgit From 2cc61be60e37b1856a97ccbdcca3e86e593bf06a Mon Sep 17 00:00:00 2001 From: David Wang Date: Thu, 3 May 2018 10:32:44 +0800 Subject: x86/CPU: Make intel_num_cpu_cores() generic intel_num_cpu_cores() is a static function in intel.c which can't be used by other files. Define another function called detect_num_cpu_cores() in common.c to replace this function so it can be reused. Signed-off-by: David Wang Signed-off-by: Thomas Gleixner Cc: lukelin@viacpu.com Cc: qiyuanwang@zhaoxin.com Cc: gregkh@linuxfoundation.org Cc: brucechang@via-alliance.com Cc: timguo@zhaoxin.com Cc: cooperyan@zhaoxin.com Cc: hpa@zytor.com Cc: benjaminpan@viatech.com Link: https://lkml.kernel.org/r/1525314766-18910-2-git-send-email-davidwang@zhaoxin.com --- arch/x86/kernel/cpu/common.c | 14 ++++++++++++++ arch/x86/kernel/cpu/cpu.h | 1 + arch/x86/kernel/cpu/intel.c | 20 +------------------- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37c7c8334a00..6993842e788c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -584,6 +584,20 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } +int detect_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx; + + if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) + return 1; + + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); + if (eax & 0x1f) + return (eax >> 26) + 1; + else + return 1; +} + void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c415f99e9599..efd6ef8ad14e 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -54,6 +54,7 @@ extern u32 get_scattered_cpuid_leaf(unsigned int level, extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); +extern int detect_num_cpu_cores(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b9693b80fc21..b54535be254a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -453,24 +453,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -/* - * find out the number of processor cores on the die - */ -static int intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return 1; - - /* Intel has a non-standard dependency on %ecx for this CPUID level. */ - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - return (eax >> 26) + 1; - else - return 1; -} - static void detect_vmx_virtcap(struct cpuinfo_x86 *c) { /* Intel VMX MSR indicated features */ @@ -671,7 +653,7 @@ static void init_intel(struct cpuinfo_x86 *c) * let's use the legacy cpuid vector 0x1 and 0x4 for topology * detection. */ - c->x86_max_cores = intel_num_cpu_cores(c); + c->x86_max_cores = detect_num_cpu_cores(c); #ifdef CONFIG_X86_32 detect_ht(c); #endif -- cgit From 2a7ffe465769bc15cb4a77f6a71e6f071d644068 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 9 May 2018 16:49:34 +0900 Subject: x86/build: Remove no-op macro VMLINUX_SYMBOL() VMLINUX_SYMBOL() is no-op unless CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX is defined. It has ever been selected only by BLACKFIN and METAG. VMLINUX_SYMBOL() is unneeded for x86-specific code. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: linux-arch Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/1525852174-29022-1-git-send-email-yamada.masahiro@socionext.com --- arch/x86/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 795f3a80e576..5e1458f609a1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -117,11 +117,11 @@ SECTIONS #ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); - VMLINUX_SYMBOL(__entry_trampoline_start) = .; + __entry_trampoline_start = .; _entry_trampoline = .; *(.entry_trampoline) . = ALIGN(PAGE_SIZE); - VMLINUX_SYMBOL(__entry_trampoline_end) = .; + __entry_trampoline_end = .; ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); #endif -- cgit From 807e9bc8e2fe6b4907f9f77fd073f7ef5073af29 Mon Sep 17 00:00:00 2001 From: David Wang Date: Thu, 3 May 2018 10:32:45 +0800 Subject: x86/CPU: Move cpu_detect_cache_sizes() into init_intel_cacheinfo() There is no point in having the conditional cpu_detect_cache_sizes() call at the callsite of init_intel_cacheinfo(). Move it into init_intel_cacheinfo() and make init_intel_cacheinfo() void. [ tglx: Made the init_intel_cacheinfo() void as the return value was pointless. Adjust changelog accordingly ] Signed-off-by: David Wang Signed-off-by: Thomas Gleixner Cc: lukelin@viacpu.com Cc: qiyuanwang@zhaoxin.com Cc: gregkh@linuxfoundation.org Cc: brucechang@via-alliance.com Cc: timguo@zhaoxin.com Cc: cooperyan@zhaoxin.com Cc: hpa@zytor.com Cc: benjaminpan@viatech.com Link: https://lkml.kernel.org/r/1525314766-18910-3-git-send-email-davidwang@zhaoxin.com --- arch/x86/kernel/cpu/cacheinfo.c | 5 +++-- arch/x86/kernel/cpu/cpu.h | 2 +- arch/x86/kernel/cpu/intel.c | 14 ++++---------- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 58d472c84ba2..38354c66df81 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -691,7 +691,7 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c) } } -unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) +void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; @@ -843,7 +843,8 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); - return l2; + if (!l2) + cpu_detect_cache_sizes(c); } static int __cache_amd_cpumap_setup(unsigned int cpu, int index, diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index efd6ef8ad14e..49bf8a080105 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -51,7 +51,7 @@ extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf, enum cpuid_regs_idx reg); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern int detect_num_cpu_cores(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b54535be254a..ca141d159be1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -635,8 +635,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) static void init_intel(struct cpuinfo_x86 *c) { - unsigned int l2 = 0; - early_init_intel(c); intel_workarounds(c); @@ -659,13 +657,7 @@ static void init_intel(struct cpuinfo_x86 *c) #endif } - l2 = init_intel_cacheinfo(c); - - /* Detect legacy cache sizes if init_intel_cacheinfo did not */ - if (l2 == 0) { - cpu_detect_cache_sizes(c); - l2 = c->x86_cache_size; - } + init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -678,7 +670,8 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (boot_cpu_has(X86_FEATURE_DS)) { - unsigned int l1; + unsigned int l1, l2; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) set_cpu_cap(c, X86_FEATURE_BTS); @@ -706,6 +699,7 @@ static void init_intel(struct cpuinfo_x86 *c) * Dixon is NOT a Celeron. */ if (c->x86 == 6) { + unsigned int l2 = c->x86_cache_size; char *p = NULL; switch (c->x86_model) { -- cgit From a2aa578fec8c29436bce5e6c15e1e31729d539a3 Mon Sep 17 00:00:00 2001 From: David Wang Date: Thu, 3 May 2018 10:32:46 +0800 Subject: x86/Centaur: Report correct CPU/cache topology Centaur CPUs enumerate the cache topology in the same way as Intel CPUs, but the function is unused so for. The Centaur init code also misses to initialize x86_info::max_cores, so the CPU topology can't be described correctly. Initialize x86_info::max_cores and invoke init_cacheinfo() to make CPU and cache topology information available and correct. Signed-off-by: David Wang Signed-off-by: Thomas Gleixner Cc: lukelin@viacpu.com Cc: qiyuanwang@zhaoxin.com Cc: gregkh@linuxfoundation.org Cc: brucechang@via-alliance.com Cc: timguo@zhaoxin.com Cc: cooperyan@zhaoxin.com Cc: hpa@zytor.com Cc: benjaminpan@viatech.com Link: https://lkml.kernel.org/r/1525314766-18910-4-git-send-email-davidwang@zhaoxin.com --- arch/x86/kernel/cpu/centaur.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 80d5110481ec..c265494234e6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -160,6 +160,11 @@ static void init_centaur(struct cpuinfo_x86 *c) clear_cpu_cap(c, 0*32+31); #endif early_init_centaur(c); + init_intel_cacheinfo(c); + c->x86_max_cores = detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); -- cgit From 9305bd6ca7b40fece04d7a7a02765e9e8349f146 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 May 2018 11:43:53 +0200 Subject: x86/CPU: Move x86_cpuinfo::x86_max_cores assignment to detect_num_cpu_cores() No point to have it at the call sites. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/common.c | 9 ++++----- arch/x86/kernel/cpu/cpu.h | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c265494234e6..14433ff5b828 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -161,7 +161,7 @@ static void init_centaur(struct cpuinfo_x86 *c) #endif early_init_centaur(c); init_intel_cacheinfo(c); - c->x86_max_cores = detect_num_cpu_cores(c); + detect_num_cpu_cores(c); #ifdef CONFIG_X86_32 detect_ht(c); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6993842e788c..f2085d54c2c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -584,18 +584,17 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } -int detect_num_cpu_cores(struct cpuinfo_x86 *c) +void detect_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; + c->x86_max_cores = 1; if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return 1; + return; cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) - return (eax >> 26) + 1; - else - return 1; + c->x86_max_cores = (eax >> 26) + 1; } void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 49bf8a080105..295cb00a5ac5 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -54,7 +54,7 @@ extern u32 get_scattered_cpuid_leaf(unsigned int level, extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); -extern int detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ca141d159be1..6c414e2f3f5c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -651,7 +651,7 @@ static void init_intel(struct cpuinfo_x86 *c) * let's use the legacy cpuid vector 0x1 and 0x4 for topology * detection. */ - c->x86_max_cores = detect_num_cpu_cores(c); + detect_num_cpu_cores(c); #ifdef CONFIG_X86_32 detect_ht(c); #endif -- cgit From a7a3153a98d581196ce092e0b83cac2c4ee1fd1f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 May 2018 08:15:45 -0700 Subject: x86/early-quirks: Rename duplicate define of dev_err dev_err is becoming a macro calling _dev_err to allow prefixing of dev_fmt to any dev_ use that has a #define dev_fmt(fmt) similar to the existing #define pr_fmt(fmt) uses. Remove this dev_err macro and convert the existing two uses to pr_err. This allows clean compilation in the patch that introduces dev_fmt which can prefix dev_ logging macros with arbitrary content similar to the #define pr_fmt macro. Signed-off-by: Joe Perches Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/8fb4b2a77d50e21ae1f7e4e267e68691efe2c270.1525878372.git.joe@perches.com --- arch/x86/kernel/early-quirks.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bae0d32e327b..da5d8ac60062 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -28,8 +28,6 @@ #include #include -#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) - static void __init fix_hypertransport_config(int num, int slot, int func) { u32 htcfg; @@ -617,7 +615,8 @@ static void __init apple_airport_reset(int bus, int slot, int func) pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { - dev_err("Cannot power up Apple AirPort card\n"); + pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n", + bus, slot, func); return; } } @@ -628,7 +627,8 @@ static void __init apple_airport_reset(int bus, int slot, int func) mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); if (!mmio) { - dev_err("Cannot iomap Apple AirPort card\n"); + pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n", + bus, slot, func); return; } -- cgit From 1de392f5d5e803663abbd8ed084233f154152bcd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 10 May 2018 08:45:30 -0700 Subject: x86: Remove pr_fmt duplicate logging prefixes Converting pr_fmt from a default simple #define to use KBUILD_MODNAME added some duplicate prefixes. Remove the duplicate prefixes. Signed-off-by: Joe Perches Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/e7b709a2b040af7faa81b0aa2c3a125aed628a82.1525964383.git.joe@perches.com --- arch/x86/kernel/e820.c | 32 +++++++++++++++++--------------- arch/x86/kernel/hpet.c | 5 ++--- arch/x86/kernel/uprobes.c | 4 ++-- 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6a2cb1442e05..d1f25c831447 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -155,7 +155,8 @@ static void __init __e820__range_add(struct e820_table *table, u64 start, u64 si int x = table->nr_entries; if (x >= ARRAY_SIZE(table->entries)) { - pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1); + pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", + start, start + size - 1); return; } @@ -190,9 +191,10 @@ void __init e820__print_table(char *who) int i; for (i = 0; i < e820_table->nr_entries; i++) { - pr_info("%s: [mem %#018Lx-%#018Lx] ", who, - e820_table->entries[i].addr, - e820_table->entries[i].addr + e820_table->entries[i].size - 1); + pr_info("%s: [mem %#018Lx-%#018Lx] ", + who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); e820_print_type(e820_table->entries[i].type); pr_cont("\n"); @@ -574,7 +576,7 @@ void __init e820__update_table_print(void) if (e820__update_table(e820_table)) return; - pr_info("e820: modified physical RAM map:\n"); + pr_info("modified physical RAM map:\n"); e820__print_table("modified"); } @@ -636,9 +638,8 @@ __init void e820__setup_pci_gap(void) if (!found) { #ifdef CONFIG_X86_64 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; - pr_err( - "e820: Cannot find an available gap in the 32-bit address range\n" - "e820: PCI devices with unassigned 32-bit BARs may not work!\n"); + pr_err("Cannot find an available gap in the 32-bit address range\n"); + pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); #else gapstart = 0x10000000; #endif @@ -649,7 +650,8 @@ __init void e820__setup_pci_gap(void) */ pci_mem_start = gapstart; - pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1); + pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); } /* @@ -711,7 +713,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); early_memunmap(sdata, data_len); - pr_info("e820: extended physical RAM map:\n"); + pr_info("extended physical RAM map:\n"); e820__print_table("extended"); } @@ -780,7 +782,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); - pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } @@ -830,8 +832,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n", - last_pfn, max_arch_pfn); + pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); return last_pfn; } @@ -1005,7 +1007,7 @@ void __init e820__finish_early_params(void) if (e820__update_table(e820_table) < 0) early_panic("Invalid user supplied memory map"); - pr_info("e820: user-defined physical RAM map:\n"); + pr_info("user-defined physical RAM map:\n"); e820__print_table("user"); } } @@ -1238,7 +1240,7 @@ void __init e820__memory_setup(void) memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); - pr_info("e820: BIOS-provided physical RAM map:\n"); + pr_info("BIOS-provided physical RAM map:\n"); e820__print_table(who); } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 8ce4212e2b8d..b6be34ee88e9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -975,8 +975,7 @@ int __init hpet_enable(void) cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); if (cfg) - pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", - cfg); + pr_warn("Unrecognized bits %#x set in global cfg\n", cfg); for (i = 0; i <= last; ++i) { cfg = hpet_readl(HPET_Tn_CFG(i)); @@ -988,7 +987,7 @@ int __init hpet_enable(void) | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE | HPET_TN_FSB | HPET_TN_FSB_CAP); if (cfg) - pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", + pr_warn("Unrecognized bits %#x set in cfg#%u\n", cfg, i); } hpet_print_config(); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 85c7ef23d99f..d1c468741915 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1079,8 +1079,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; if (nleft != rasize) { - pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " - "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); + pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", + current->pid, regs->sp, regs->ip); force_sig_info(SIGSEGV, SEND_SIG_FORCED, current); } -- cgit From e6d8c84a58380030457759ad6085f3792a76b2ce Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 10 May 2018 08:45:31 -0700 Subject: x86/mtrr: Rename main.c to mtrr.c and remove duplicate prefixes Kbuild uses the first file as the name for KBUILD_MODNAME. mtrr uses main.c as its first file, so rename that file to mtrr.c and fixup the Makefile. Remove the now duplicate "mtrr: " prefixes from the logging calls. Signed-off-by: Joe Perches Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/ae1fa81a0d1fad87571967b91ea90f70f486e853.1525964384.git.joe@perches.com --- arch/x86/kernel/cpu/mtrr/Makefile | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 887 -------------------------------------- arch/x86/kernel/cpu/mtrr/mtrr.c | 886 +++++++++++++++++++++++++++++++++++++ 3 files changed, 887 insertions(+), 888 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mtrr/main.c create mode 100644 arch/x86/kernel/cpu/mtrr/mtrr.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index ad9e5ed81181..2ad9107ee980 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o cleanup.o +obj-y := mtrr.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c deleted file mode 100644 index 7468de429087..000000000000 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ /dev/null @@ -1,887 +0,0 @@ -/* Generic MTRR (Memory Type Range Register) driver. - - Copyright (C) 1997-2000 Richard Gooch - Copyright (c) 2002 Patrick Mochel - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - - Richard Gooch may be reached by email at rgooch@atnf.csiro.au - The postal address is: - Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. - - Source: "Pentium Pro Family Developer's Manual, Volume 3: - Operating System Writer's Guide" (Intel document number 242692), - section 11.11.7 - - This was cleaned and made readable by Patrick Mochel - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: - System Programming Guide; Section 9.11. (1997 edition - PPro). -*/ - -#define DEBUG - -#include /* FIXME: kvm_para.h needs this */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "mtrr.h" - -/* arch_phys_wc_add returns an MTRR register index plus this offset. */ -#define MTRR_TO_PHYS_WC_OFFSET 1000 - -u32 num_var_ranges; -static bool __mtrr_enabled; - -static bool mtrr_enabled(void) -{ - return __mtrr_enabled; -} - -unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; -static DEFINE_MUTEX(mtrr_mutex); - -u64 size_or_mask, size_and_mask; -static bool mtrr_aps_delayed_init; - -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; - -const struct mtrr_ops *mtrr_if; - -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type); - -void __init set_mtrr_ops(const struct mtrr_ops *ops) -{ - if (ops->vendor && ops->vendor < X86_VENDOR_NUM) - mtrr_ops[ops->vendor] = ops; -} - -/* Returns non-zero if we have the write-combining memory type */ -static int have_wrcomb(void) -{ - struct pci_dev *dev; - - dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); - if (dev != NULL) { - /* - * ServerWorks LE chipsets < rev 6 have problems with - * write-combining. Don't allow it and leave room for other - * chipsets to be tagged - */ - if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && - dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && - dev->revision <= 5) { - pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); - pci_dev_put(dev); - return 0; - } - /* - * Intel 450NX errata # 23. Non ascending cacheline evictions to - * write combining memory may resulting in data corruption - */ - if (dev->vendor == PCI_VENDOR_ID_INTEL && - dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); - pci_dev_put(dev); - return 0; - } - pci_dev_put(dev); - } - return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; -} - -/* This function returns the number of variable MTRRs */ -static void __init set_num_var_ranges(void) -{ - unsigned long config = 0, dummy; - - if (use_intel()) - rdmsr(MSR_MTRRcap, config, dummy); - else if (is_cpu(AMD)) - config = 2; - else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) - config = 8; - - num_var_ranges = config & 0xff; -} - -static void __init init_table(void) -{ - int i, max; - - max = num_var_ranges; - for (i = 0; i < max; i++) - mtrr_usage_table[i] = 1; -} - -struct set_mtrr_data { - unsigned long smp_base; - unsigned long smp_size; - unsigned int smp_reg; - mtrr_type smp_type; -}; - -/** - * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed - * by all the CPUs. - * @info: pointer to mtrr configuration data - * - * Returns nothing. - */ -static int mtrr_rendezvous_handler(void *info) -{ - struct set_mtrr_data *data = info; - - /* - * We use this same function to initialize the mtrrs during boot, - * resume, runtime cpu online and on an explicit request to set a - * specific MTRR. - * - * During boot or suspend, the state of the boot cpu's mtrrs has been - * saved, and we want to replicate that across all the cpus that come - * online (either at the end of boot or resume or during a runtime cpu - * online). If we're doing that, @reg is set to something special and on - * all the cpu's we do mtrr_if->set_all() (On the logical cpu that - * started the boot/resume sequence, this might be a duplicate - * set_all()). - */ - if (data->smp_reg != ~0U) { - mtrr_if->set(data->smp_reg, data->smp_base, - data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { - mtrr_if->set_all(); - } - return 0; -} - -static inline int types_compatible(mtrr_type type1, mtrr_type type2) -{ - return type1 == MTRR_TYPE_UNCACHABLE || - type2 == MTRR_TYPE_UNCACHABLE || - (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || - (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); -} - -/** - * set_mtrr - update mtrrs on all processors - * @reg: mtrr in question - * @base: mtrr base - * @size: mtrr size - * @type: mtrr type - * - * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * - * 1. Queue work to do the following on all processors: - * 2. Disable Interrupts - * 3. Wait for all procs to do so - * 4. Enter no-fill cache mode - * 5. Flush caches - * 6. Clear PGE bit - * 7. Flush all TLBs - * 8. Disable all range registers - * 9. Update the MTRRs - * 10. Enable all range registers - * 11. Flush all TLBs and caches again - * 12. Enter normal cache mode and reenable caching - * 13. Set PGE - * 14. Wait for buddies to catch up - * 15. Enable interrupts. - * - * What does that mean for us? Well, stop_machine() will ensure that - * the rendezvous handler is started on each CPU. And in lockstep they - * do the state transition of disabling interrupts, updating MTRR's - * (the CPU vendors may each do it differently, so we call mtrr_if->set() - * callback and let them take care of it.) and enabling interrupts. - * - * Note that the mechanism is the same for UP systems, too; all the SMP stuff - * becomes nops. - */ -static void -set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); -} - -static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); -} - -static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data = { .smp_reg = reg, - .smp_base = base, - .smp_size = size, - .smp_type = type - }; - - stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, - cpu_callout_mask); -} - -/** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (in units of 4 kB!) - * @size: Physical size of region in pages (4 kB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region - * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. - * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. - * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. - * - * The available types are - * - * %MTRR_TYPE_UNCACHABLE - No caching - * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes - * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. - */ -int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, bool increment) -{ - unsigned long lbase, lsize; - int i, replace, error; - mtrr_type ltype; - - if (!mtrr_enabled()) - return -ENXIO; - - error = mtrr_if->validate_add_page(base, size, type); - if (error) - return error; - - if (type >= MTRR_NUM_TYPES) { - pr_warn("mtrr: type: %u invalid\n", type); - return -EINVAL; - } - - /* If the type is WC, check that this processor supports it */ - if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - pr_warn("mtrr: your processor doesn't support write-combining\n"); - return -ENOSYS; - } - - if (!size) { - pr_warn("mtrr: zero sized request\n"); - return -EINVAL; - } - - if ((base | (base + size - 1)) >> - (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { - pr_warn("mtrr: base or size exceeds the MTRR width\n"); - return -EINVAL; - } - - error = -EINVAL; - replace = -1; - - /* No CPU hotplug when we change MTRR entries */ - get_online_cpus(); - - /* Search for existing MTRR */ - mutex_lock(&mtrr_mutex); - for (i = 0; i < num_var_ranges; ++i) { - mtrr_if->get(i, &lbase, &lsize, <ype); - if (!lsize || base > lbase + lsize - 1 || - base + size - 1 < lbase) - continue; - /* - * At this point we know there is some kind of - * overlap/enclosure - */ - if (base < lbase || base + size - 1 > lbase + lsize - 1) { - if (base <= lbase && - base + size - 1 >= lbase + lsize - 1) { - /* New region encloses an existing region */ - if (type == ltype) { - replace = replace == -1 ? i : -2; - continue; - } else if (types_compatible(type, ltype)) - continue; - } - pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); - goto out; - } - /* New region is enclosed by an existing region */ - if (ltype != type) { - if (types_compatible(type, ltype)) - continue; - pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); - goto out; - } - if (increment) - ++mtrr_usage_table[i]; - error = i; - goto out; - } - /* Search for an empty MTRR */ - i = mtrr_if->get_free_region(base, size, replace); - if (i >= 0) { - set_mtrr_cpuslocked(i, base, size, type); - if (likely(replace < 0)) { - mtrr_usage_table[i] = 1; - } else { - mtrr_usage_table[i] = mtrr_usage_table[replace]; - if (increment) - mtrr_usage_table[i]++; - if (unlikely(replace != i)) { - set_mtrr_cpuslocked(replace, 0, 0, 0); - mtrr_usage_table[replace] = 0; - } - } - } else { - pr_info("mtrr: no more MTRRs available\n"); - } - error = i; - out: - mutex_unlock(&mtrr_mutex); - put_online_cpus(); - return error; -} - -static int mtrr_check(unsigned long base, unsigned long size) -{ - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); - pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); - dump_stack(); - return -1; - } - return 0; -} - -/** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region - * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. - * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. - * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. - * - * The available types are - * - * %MTRR_TYPE_UNCACHABLE - No caching - * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes - * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. - */ -int mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) -{ - if (!mtrr_enabled()) - return -ENODEV; - if (mtrr_check(base, size)) - return -EINVAL; - return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, - increment); -} - -/** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region - * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. - * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. - */ -int mtrr_del_page(int reg, unsigned long base, unsigned long size) -{ - int i, max; - mtrr_type ltype; - unsigned long lbase, lsize; - int error = -EINVAL; - - if (!mtrr_enabled()) - return -ENODEV; - - max = num_var_ranges; - /* No CPU hotplug when we change MTRR entries */ - get_online_cpus(); - mutex_lock(&mtrr_mutex); - if (reg < 0) { - /* Search for existing MTRR */ - for (i = 0; i < max; ++i) { - mtrr_if->get(i, &lbase, &lsize, <ype); - if (lbase == base && lsize == size) { - reg = i; - break; - } - } - if (reg < 0) { - pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", - base, size); - goto out; - } - } - if (reg >= max) { - pr_warn("mtrr: register: %d too big\n", reg); - goto out; - } - mtrr_if->get(reg, &lbase, &lsize, <ype); - if (lsize < 1) { - pr_warn("mtrr: MTRR %d not used\n", reg); - goto out; - } - if (mtrr_usage_table[reg] < 1) { - pr_warn("mtrr: reg: %d has count=0\n", reg); - goto out; - } - if (--mtrr_usage_table[reg] < 1) - set_mtrr_cpuslocked(reg, 0, 0, 0); - error = reg; - out: - mutex_unlock(&mtrr_mutex); - put_online_cpus(); - return error; -} - -/** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region - * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. - * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. - */ -int mtrr_del(int reg, unsigned long base, unsigned long size) -{ - if (!mtrr_enabled()) - return -ENODEV; - if (mtrr_check(base, size)) - return -EINVAL; - return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); -} - -/** - * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable - * @base: Physical base address - * @size: Size of region - * - * If PAT is available, this does nothing. If PAT is unavailable, it - * attempts to add a WC MTRR covering size bytes starting at base and - * logs an error if this fails. - * - * The called should provide a power of two size on an equivalent - * power of two boundary. - * - * Drivers must store the return value to pass to mtrr_del_wc_if_needed, - * but drivers should not try to interpret that return value. - */ -int arch_phys_wc_add(unsigned long base, unsigned long size) -{ - int ret; - - if (pat_enabled() || !mtrr_enabled()) - return 0; /* Success! (We don't need to do anything.) */ - - ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); - if (ret < 0) { - pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", - (void *)base, (void *)(base + size - 1)); - return ret; - } - return ret + MTRR_TO_PHYS_WC_OFFSET; -} -EXPORT_SYMBOL(arch_phys_wc_add); - -/* - * arch_phys_wc_del - undoes arch_phys_wc_add - * @handle: Return value from arch_phys_wc_add - * - * This cleans up after mtrr_add_wc_if_needed. - * - * The API guarantees that mtrr_del_wc_if_needed(error code) and - * mtrr_del_wc_if_needed(0) do nothing. - */ -void arch_phys_wc_del(int handle) -{ - if (handle >= 1) { - WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); - mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); - } -} -EXPORT_SYMBOL(arch_phys_wc_del); - -/* - * arch_phys_wc_index - translates arch_phys_wc_add's return value - * @handle: Return value from arch_phys_wc_add - * - * This will turn the return value from arch_phys_wc_add into an mtrr - * index suitable for debugging. - * - * Note: There is no legitimate use for this function, except possibly - * in printk line. Alas there is an illegitimate use in some ancient - * drm ioctls. - */ -int arch_phys_wc_index(int handle) -{ - if (handle < MTRR_TO_PHYS_WC_OFFSET) - return -1; - else - return handle - MTRR_TO_PHYS_WC_OFFSET; -} -EXPORT_SYMBOL_GPL(arch_phys_wc_index); - -/* - * HACK ALERT! - * These should be called implicitly, but we can't yet until all the initcall - * stuff is done... - */ -static void __init init_ifs(void) -{ -#ifndef CONFIG_X86_64 - amd_init_mtrr(); - cyrix_init_mtrr(); - centaur_init_mtrr(); -#endif -} - -/* The suspend/resume methods are only for CPU without MTRR. CPU using generic - * MTRR driver doesn't require this - */ -struct mtrr_value { - mtrr_type ltype; - unsigned long lbase; - unsigned long lsize; -}; - -static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; - -static int mtrr_save(void) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); - } - return 0; -} - -static void mtrr_restore(void) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) { - set_mtrr(i, mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); - } - } -} - - - -static struct syscore_ops mtrr_syscore_ops = { - .suspend = mtrr_save, - .resume = mtrr_restore, -}; - -int __initdata changed_by_mtrr_cleanup; - -#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) -/** - * mtrr_bp_init - initialize mtrrs on the boot CPU - * - * This needs to be called early; before any of the other CPUs are - * initialized (i.e. before smp_init()). - * - */ -void __init mtrr_bp_init(void) -{ - u32 phys_addr; - - init_ifs(); - - phys_addr = 32; - - if (boot_cpu_has(X86_FEATURE_MTRR)) { - mtrr_if = &generic_mtrr_ops; - size_or_mask = SIZE_OR_MASK_BITS(36); - size_and_mask = 0x00f00000; - phys_addr = 36; - - /* - * This is an AMD specific MSR, but we assume(hope?) that - * Intel will implement it too when they extend the address - * bus of the Xeon. - */ - if (cpuid_eax(0x80000000) >= 0x80000008) { - phys_addr = cpuid_eax(0x80000008) & 0xff; - /* CPUID workaround for Intel 0F33/0F34 CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 0xF && - boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_stepping == 0x3 || - boot_cpu_data.x86_stepping == 0x4)) - phys_addr = 36; - - size_or_mask = SIZE_OR_MASK_BITS(phys_addr); - size_and_mask = ~size_or_mask & 0xfffff00000ULL; - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && - boot_cpu_data.x86 == 6) { - /* - * VIA C* family have Intel style MTRRs, - * but don't support PAE - */ - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - phys_addr = 32; - } - } else { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { - /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = mtrr_ops[X86_VENDOR_AMD]; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - case X86_VENDOR_CENTAUR: - if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { - mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - case X86_VENDOR_CYRIX: - if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { - mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; - size_or_mask = SIZE_OR_MASK_BITS(32); - size_and_mask = 0; - } - break; - default: - break; - } - } - - if (mtrr_if) { - __mtrr_enabled = true; - set_num_var_ranges(); - init_table(); - if (use_intel()) { - /* BIOS may override */ - __mtrr_enabled = get_mtrr_state(); - - if (mtrr_enabled()) - mtrr_bp_pat_init(); - - if (mtrr_cleanup(phys_addr)) { - changed_by_mtrr_cleanup = 1; - mtrr_if->set_all(); - } - } - } - - if (!mtrr_enabled()) { - pr_info("MTRR: Disabled\n"); - - /* - * PAT initialization relies on MTRR's rendezvous handler. - * Skip PAT init until the handler can initialize both - * features independently. - */ - pat_disable("MTRRs disabled, skipping PAT initialization too."); - } -} - -void mtrr_ap_init(void) -{ - if (!mtrr_enabled()) - return; - - if (!use_intel() || mtrr_aps_delayed_init) - return; - /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries - * changed, but this routine will be called in cpu boot time, - * holding the lock breaks it. - * - * This routine is called in two cases: - * - * 1. very earily time of software resume, when there absolutely - * isn't mtrr entry changes; - * - * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug - * lock to prevent mtrr entry changes - */ - set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); -} - -/** - * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. - */ -void mtrr_save_state(void) -{ - int first_cpu; - - if (!mtrr_enabled()) - return; - - first_cpu = cpumask_first(cpu_online_mask); - smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); -} - -void set_mtrr_aps_delayed_init(void) -{ - if (!mtrr_enabled()) - return; - if (!use_intel()) - return; - - mtrr_aps_delayed_init = true; -} - -/* - * Delayed MTRR initialization for all AP's - */ -void mtrr_aps_init(void) -{ - if (!use_intel() || !mtrr_enabled()) - return; - - /* - * Check if someone has requested the delay of AP MTRR initialization, - * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, - * then we are done. - */ - if (!mtrr_aps_delayed_init) - return; - - set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = false; -} - -void mtrr_bp_restore(void) -{ - if (!use_intel() || !mtrr_enabled()) - return; - - mtrr_if->set_all(); -} - -static int __init mtrr_init_finialize(void) -{ - if (!mtrr_enabled()) - return 0; - - if (use_intel()) { - if (!changed_by_mtrr_cleanup) - mtrr_state_warn(); - return 0; - } - - /* - * The CPU has no MTRR and seems to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * - * TBD: is there any system with such CPU which supports - * suspend/resume? If no, we should remove the code. - */ - register_syscore_ops(&mtrr_syscore_ops); - - return 0; -} -subsys_initcall(mtrr_init_finialize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c new file mode 100644 index 000000000000..0c4f4fba9ec1 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -0,0 +1,886 @@ +/* Generic MTRR (Memory Type Range Register) driver. + + Copyright (C) 1997-2000 Richard Gooch + Copyright (c) 2002 Patrick Mochel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. + + Source: "Pentium Pro Family Developer's Manual, Volume 3: + Operating System Writer's Guide" (Intel document number 242692), + section 11.11.7 + + This was cleaned and made readable by Patrick Mochel + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: + System Programming Guide; Section 9.11. (1997 edition - PPro). +*/ + +#define DEBUG + +#include /* FIXME: kvm_para.h needs this */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mtrr.h" + +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + +u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} + +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; +static DEFINE_MUTEX(mtrr_mutex); + +u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; + +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; + +const struct mtrr_ops *mtrr_if; + +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + +void __init set_mtrr_ops(const struct mtrr_ops *ops) +{ + if (ops->vendor && ops->vendor < X86_VENDOR_NUM) + mtrr_ops[ops->vendor] = ops; +} + +/* Returns non-zero if we have the write-combining memory type */ +static int have_wrcomb(void) +{ + struct pci_dev *dev; + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ + if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && + dev->revision <= 5) { + pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ + if (dev->vendor == PCI_VENDOR_ID_INTEL && + dev->device == PCI_DEVICE_ID_INTEL_82451NX) { + pr_info("Intel 450NX MMC detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + pci_dev_put(dev); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; +} + +/* This function returns the number of variable MTRRs */ +static void __init set_num_var_ranges(void) +{ + unsigned long config = 0, dummy; + + if (use_intel()) + rdmsr(MSR_MTRRcap, config, dummy); + else if (is_cpu(AMD)) + config = 2; + else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) + config = 8; + + num_var_ranges = config & 0xff; +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + for (i = 0; i < max; i++) + mtrr_usage_table[i] = 1; +} + +struct set_mtrr_data { + unsigned long smp_base; + unsigned long smp_size; + unsigned int smp_reg; + mtrr_type smp_type; +}; + +/** + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. + * @info: pointer to mtrr configuration data + * + * Returns nothing. + */ +static int mtrr_rendezvous_handler(void *info) +{ + struct set_mtrr_data *data = info; + + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { + mtrr_if->set_all(); + } + return 0; +} + +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ + return type1 == MTRR_TYPE_UNCACHABLE || + type2 == MTRR_TYPE_UNCACHABLE || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || + (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); +} + +/** + * set_mtrr - update mtrrs on all processors + * @reg: mtrr in question + * @base: mtrr base + * @size: mtrr size + * @type: mtrr type + * + * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: + * + * 1. Queue work to do the following on all processors: + * 2. Disable Interrupts + * 3. Wait for all procs to do so + * 4. Enter no-fill cache mode + * 5. Flush caches + * 6. Clear PGE bit + * 7. Flush all TLBs + * 8. Disable all range registers + * 9. Update the MTRRs + * 10. Enable all range registers + * 11. Flush all TLBs and caches again + * 12. Enter normal cache mode and reenable caching + * 13. Set PGE + * 14. Wait for buddies to catch up + * 15. Enable interrupts. + * + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. + * + * Note that the mechanism is the same for UP systems, too; all the SMP stuff + * becomes nops. + */ +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + +static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); +} + +/** + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + unsigned long lbase, lsize; + int i, replace, error; + mtrr_type ltype; + + if (!mtrr_enabled()) + return -ENXIO; + + error = mtrr_if->validate_add_page(base, size, type); + if (error) + return error; + + if (type >= MTRR_NUM_TYPES) { + pr_warn("type: %u invalid\n", type); + return -EINVAL; + } + + /* If the type is WC, check that this processor supports it */ + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + pr_warn("your processor doesn't support write-combining\n"); + return -ENOSYS; + } + + if (!size) { + pr_warn("zero sized request\n"); + return -EINVAL; + } + + if ((base | (base + size - 1)) >> + (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { + pr_warn("base or size exceeds the MTRR width\n"); + return -EINVAL; + } + + error = -EINVAL; + replace = -1; + + /* No CPU hotplug when we change MTRR entries */ + get_online_cpus(); + + /* Search for existing MTRR */ + mutex_lock(&mtrr_mutex); + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) + continue; + /* + * At this point we know there is some kind of + * overlap/enclosure + */ + if (base < lbase || base + size - 1 > lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { + /* New region encloses an existing region */ + if (type == ltype) { + replace = replace == -1 ? i : -2; + continue; + } else if (types_compatible(type, ltype)) + continue; + } + pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); + goto out; + } + /* New region is enclosed by an existing region */ + if (ltype != type) { + if (types_compatible(type, ltype)) + continue; + pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); + goto out; + } + if (increment) + ++mtrr_usage_table[i]; + error = i; + goto out; + } + /* Search for an empty MTRR */ + i = mtrr_if->get_free_region(base, size, replace); + if (i >= 0) { + set_mtrr_cpuslocked(i, base, size, type); + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; + if (unlikely(replace != i)) { + set_mtrr_cpuslocked(replace, 0, 0, 0); + mtrr_usage_table[replace] = 0; + } + } + } else { + pr_info("no more MTRRs available\n"); + } + error = i; + out: + mutex_unlock(&mtrr_mutex); + put_online_cpus(); + return error; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + pr_warn("size and base must be multiples of 4 kiB\n"); + pr_debug("size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +/** + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) +{ + if (!mtrr_enabled()) + return -ENODEV; + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +/** + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + + if (!mtrr_enabled()) + return -ENODEV; + + max = num_var_ranges; + /* No CPU hotplug when we change MTRR entries */ + get_online_cpus(); + mutex_lock(&mtrr_mutex); + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + pr_debug("no MTRR for %lx000,%lx000 found\n", + base, size); + goto out; + } + } + if (reg >= max) { + pr_warn("register: %d too big\n", reg); + goto out; + } + mtrr_if->get(reg, &lbase, &lsize, <ype); + if (lsize < 1) { + pr_warn("MTRR %d not used\n", reg); + goto out; + } + if (mtrr_usage_table[reg] < 1) { + pr_warn("reg: %d has count=0\n", reg); + goto out; + } + if (--mtrr_usage_table[reg] < 1) + set_mtrr_cpuslocked(reg, 0, 0, 0); + error = reg; + out: + mutex_unlock(&mtrr_mutex); + put_online_cpus(); + return error; +} + +/** + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (!mtrr_enabled()) + return -ENODEV; + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled() || !mtrr_enabled()) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * arch_phys_wc_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int arch_phys_wc_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(arch_phys_wc_index); + +/* + * HACK ALERT! + * These should be called implicitly, but we can't yet until all the initcall + * stuff is done... + */ +static void __init init_ifs(void) +{ +#ifndef CONFIG_X86_64 + amd_init_mtrr(); + cyrix_init_mtrr(); + centaur_init_mtrr(); +#endif +} + +/* The suspend/resume methods are only for CPU without MTRR. CPU using generic + * MTRR driver doesn't require this + */ +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned long lsize; +}; + +static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; + +static int mtrr_save(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); + } + return 0; +} + +static void mtrr_restore(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } + } +} + + + +static struct syscore_ops mtrr_syscore_ops = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + +int __initdata changed_by_mtrr_cleanup; + +#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) +/** + * mtrr_bp_init - initialize mtrrs on the boot CPU + * + * This needs to be called early; before any of the other CPUs are + * initialized (i.e. before smp_init()). + * + */ +void __init mtrr_bp_init(void) +{ + u32 phys_addr; + + init_ifs(); + + phys_addr = 32; + + if (boot_cpu_has(X86_FEATURE_MTRR)) { + mtrr_if = &generic_mtrr_ops; + size_or_mask = SIZE_OR_MASK_BITS(36); + size_and_mask = 0x00f00000; + phys_addr = 36; + + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it too when they extend the address + * bus of the Xeon. + */ + if (cpuid_eax(0x80000000) >= 0x80000008) { + phys_addr = cpuid_eax(0x80000008) & 0xff; + /* CPUID workaround for Intel 0F33/0F34 CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 0xF && + boot_cpu_data.x86_model == 0x3 && + (boot_cpu_data.x86_stepping == 0x3 || + boot_cpu_data.x86_stepping == 0x4)) + phys_addr = 36; + + size_or_mask = SIZE_OR_MASK_BITS(phys_addr); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && + boot_cpu_data.x86 == 6) { + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + phys_addr = 32; + } + } else { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { + /* Pre-Athlon (K6) AMD CPU MTRRs */ + mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + case X86_VENDOR_CENTAUR: + if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { + mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + case X86_VENDOR_CYRIX: + if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { + mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + default: + break; + } + } + + if (mtrr_if) { + __mtrr_enabled = true; + set_num_var_ranges(); + init_table(); + if (use_intel()) { + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); + + if (mtrr_enabled()) + mtrr_bp_pat_init(); + + if (mtrr_cleanup(phys_addr)) { + changed_by_mtrr_cleanup = 1; + mtrr_if->set_all(); + } + } + } + + if (!mtrr_enabled()) { + pr_info("Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } +} + +void mtrr_ap_init(void) +{ + if (!mtrr_enabled()) + return; + + if (!use_intel() || mtrr_aps_delayed_init) + return; + /* + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes + */ + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); +} + +/** + * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. + */ +void mtrr_save_state(void) +{ + int first_cpu; + + if (!mtrr_enabled()) + return; + + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); +} + +void set_mtrr_aps_delayed_init(void) +{ + if (!mtrr_enabled()) + return; + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * Delayed MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel() || !mtrr_enabled()) + return; + + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel() || !mtrr_enabled()) + return; + + mtrr_if->set_all(); +} + +static int __init mtrr_init_finialize(void) +{ + if (!mtrr_enabled()) + return 0; + + if (use_intel()) { + if (!changed_by_mtrr_cleanup) + mtrr_state_warn(); + return 0; + } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + register_syscore_ops(&mtrr_syscore_ops); + + return 0; +} +subsys_initcall(mtrr_init_finialize); -- cgit From 3f3942aca6da351a12543aa776467791b63b3a78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 May 2018 15:57:23 +0200 Subject: proc: introduce proc_create_single{,_data} Variants of proc_create{,_data} that directly take a seq_file show callback and drastically reduces the boilerplate code in the callers. All trivial callers converted over. Signed-off-by: Christoph Hellwig --- arch/x86/kernel/apm_32.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index dfcbe6924eaf..cadeafabf167 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1715,19 +1715,6 @@ static int proc_apm_show(struct seq_file *m, void *v) return 0; } -static int proc_apm_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_apm_show, NULL); -} - -static const struct file_operations apm_file_ops = { - .owner = THIS_MODULE, - .open = proc_apm_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int apm(void *unused) { unsigned short bx; @@ -2360,7 +2347,7 @@ static int __init apm_init(void) set_desc_base(&gdt[APM_DS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); - proc_create("apm", 0, NULL, &apm_file_ops); + proc_create_single("apm", 0, NULL, proc_apm_show); kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { -- cgit From ad3fe525b9507d8d750d60e8e5dd8e0c0836fb99 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 18 May 2018 13:35:23 +0300 Subject: x86/mm: Unify pgtable_l5_enabled usage in early boot code Usually pgtable_l5_enabled is defined using cpu_feature_enabled(). cpu_feature_enabled() is not available in early boot code. We use several different preprocessor tricks to get around it. It's messy. Unify them all. If cpu_feature_enabled() is not yet available, USE_EARLY_PGTABLE_L5 can be defined before all includes. It makes pgtable_l5_enabled rely on __pgtable_l5_enabled variable instead. This approach fits all early users. Signed-off-by: Kirill A. Shutemov Reviewed-by: Thomas Gleixner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180518103528.59260-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2d29e47c056e..494fea1dbd6e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -6,6 +6,10 @@ */ #define DISABLE_BRANCH_PROFILING + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + #include #include #include @@ -32,11 +36,6 @@ #include #include -#ifdef CONFIG_X86_5LEVEL -#undef pgtable_l5_enabled -#define pgtable_l5_enabled __pgtable_l5_enabled -#endif - /* * Manage page tables very early on. */ @@ -46,7 +45,6 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL unsigned int __pgtable_l5_enabled __ro_after_init; -EXPORT_SYMBOL(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; @@ -88,7 +86,7 @@ static bool __head check_la57_support(unsigned long physaddr) if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) return false; - *fixup_int(&pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; *fixup_int(&pgdir_shift, physaddr) = 48; *fixup_int(&ptrs_per_p4d, physaddr) = 512; *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; -- cgit From ed7588d5dc6f5e7202fb9bbeb14d94706ba225d7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 18 May 2018 13:35:24 +0300 Subject: x86/mm: Stop pretending pgtable_l5_enabled is a variable pgtable_l5_enabled is defined using cpu_feature_enabled() but we refer to it as a variable. This is misleading. Make pgtable_l5_enabled() a function. We cannot literally define it as a function due to circular dependencies between header files. Function-alike macros is close enough. Signed-off-by: Kirill A. Shutemov Reviewed-by: Thomas Gleixner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180518103528.59260-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/machine_kexec_64.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 494fea1dbd6e..8d372d1c266d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -279,7 +279,7 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... */ - if (!pgtable_l5_enabled) + if (!pgtable_l5_enabled()) p4d_p = pgd_p; else if (pgd) p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6010449ca6d2..4c8acdfdc5a7 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -354,7 +354,8 @@ void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_top_pgt); - VMCOREINFO_NUMBER(pgtable_l5_enabled); + vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", + pgtable_l5_enabled()); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); -- cgit From 372fddf709041743a93e381556f4c41aad1e28f8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 18 May 2018 13:35:25 +0300 Subject: x86/mm: Introduce the 'no5lvl' kernel parameter This kernel parameter allows to force kernel to use 4-level paging even if hardware and kernel support 5-level paging. The option may be useful to work around regressions related to 5-level paging. Signed-off-by: Kirill A. Shutemov Reviewed-by: Thomas Gleixner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180518103528.59260-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 15 +++++++++++++++ arch/x86/kernel/head64.c | 9 +++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 39ed2e6ff8a0..27f68d14c962 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1028,6 +1028,21 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ setup_clear_cpu_cap(X86_FEATURE_PCID); #endif + + /* + * Later in the boot process pgtable_l5_enabled() relies on + * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not + * enabled by this point we need to clear the feature bit to avoid + * false-positives at the later stage. + * + * pgtable_l5_enabled() can be false here for several reasons: + * - 5-level paging is disabled compile-time; + * - it's 32-bit kernel; + * - machine doesn't support 5-level paging; + * - user specified 'no5lvl' in kernel command line. + */ + if (!pgtable_l5_enabled()) + setup_clear_cpu_cap(X86_FEATURE_LA57); } void __init early_cpu_init(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8d372d1c266d..8047379e575a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -80,10 +80,11 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { - if (native_cpuid_eax(0) < 7) - return false; - - if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + /* + * 5-level paging is detected and enabled at kernel decomression + * stage. Only check if it has been enabled there. + */ + if (!(native_read_cr4() & X86_CR4_LA57)) return false; *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; -- cgit From e4e961e36f063484c48bed919013c106d178995d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 18 May 2018 13:35:28 +0300 Subject: x86/mm: Mark __pgtable_l5_enabled __initdata __pgtable_l5_enabled shouldn't be needed after system has booted. All preparation is done. We can now mark it as __initdata. Signed-off-by: Kirill A. Shutemov Reviewed-by: Thomas Gleixner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180518103528.59260-8-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..a21d6ace648e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -44,7 +44,7 @@ static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled __ro_after_init; +unsigned int __pgtable_l5_enabled __initdata; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; -- cgit From 19c635ab24a1e94a759e82bfb34554a6a0db215e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:17 -0700 Subject: x86/intel_rdt/mba_sc: Enable/disable MBA software controller Currently user does memory bandwidth allocation(MBA) by specifying the bandwidth in percentage via the resctrl schemata file: "/sys/fs/resctrl/schemata" Add a new mount option "mba_MBps" to enable the user to specify MBA in MBps: $mount -t resctrl resctrl [-o cdp[,cdpl2][mba_MBps]] /sys/fs/resctrl Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-3-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 8 ++++++++ arch/x86/kernel/cpu/intel_rdt.h | 3 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 589b948e6e01..53ee6838c496 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + + return r->membw.mba_sc; +} + /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 3fd7a70ee04a..74aee0fdc97c 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -259,6 +259,7 @@ struct rdt_cache { * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale + * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct rdt_membw { @@ -266,6 +267,7 @@ struct rdt_membw { u32 min_bw; u32 bw_gran; u32 delay_linear; + bool mba_sc; u32 *mb_map; }; @@ -445,6 +447,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +bool is_mba_sc(struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fca759d272a1..440025446239 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1005,6 +1005,11 @@ static void l2_qos_cfg_update(void *arg) wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); } +static inline bool is_mba_linear(void) +{ + return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear; +} + static int set_cache_qos_cfg(int level, bool enable) { void (*update)(void *arg); @@ -1041,6 +1046,25 @@ static int set_cache_qos_cfg(int level, bool enable) return 0; } +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale. + */ +static int set_mba_sc(bool mba_sc) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; + + if (!is_mbm_enabled() || !is_mba_linear() || + mba_sc == is_mba_sc(r)) + return -EINVAL; + + r->membw.mba_sc = mba_sc; + + return 0; +} + static int cdp_enable(int level, int data_type, int code_type) { struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; @@ -1123,6 +1147,10 @@ static int parse_rdtgroupfs_options(char *data) ret = cdpl2_enable(); if (ret) goto out; + } else if (!strcmp(token, "mba_MBps")) { + ret = set_mba_sc(true); + if (ret) + goto out; } else { ret = -EINVAL; goto out; @@ -1445,6 +1473,8 @@ static void rdt_kill_sb(struct super_block *sb) cpus_read_lock(); mutex_lock(&rdtgroup_mutex); + set_mba_sc(false); + /*Put everything back to default values. */ for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); -- cgit From 1bd2a63b4f0deefe745aa0fd969c07b2eb9ee99e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:18 -0700 Subject: x86/intel_rdt/mba_sc: Add initialization support When MBA software controller is enabled, a per domain storage is required for user specified bandwidth in "MBps" and the "percentage" values which are programmed into the IA32_MBA_THRTL_MSR. Add support for these data structures and initialization. The MBA percentage values have a default max value of 100 but however the max value in MBps is not available from the hardware so it's set to U32_MAX. This simply says that the control group can use all bandwidth by default but does not say what is the actual max bandwidth available. The actual bandwidth that is available may depend on lot of factors like QPI link, number of memory channels, memory channel frequency, its width and memory speed, how many channels are configured and also if memory interleaving is enabled. So there is no way to determine the maximum at runtime reliably. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-4-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 37 +++++++++++++++++++++++--------- arch/x86/kernel/cpu/intel_rdt.h | 3 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 3 +++ 3 files changed, 33 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 53ee6838c496..8c09e9db2fc6 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -35,6 +35,7 @@ #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 +#define MBA_MAX_MBPS U32_MAX /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); @@ -439,25 +440,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +{ + int i; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100% + * and the bandwidth in MBps to U32_MAX + */ + for (i = 0; i < r->num_closid; i++, dc++, dm++) { + *dc = r->default_ctrl; + *dm = MBA_MAX_MBPS; + } +} + static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { struct msr_param m; - u32 *dc; - int i; + u32 *dc, *dm; dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - d->ctrl_val = dc; + dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); + if (!dm) { + kfree(dc); + return -ENOMEM; + } - /* - * Initialize the Control MSRs to having no control. - * For Cache Allocation: Set all bits in cbm - * For Memory Allocation: Set b/w requested to 100 - */ - for (i = 0; i < r->num_closid; i++, dc++) - *dc = r->default_ctrl; + d->ctrl_val = dc; + d->mbps_val = dm; + setup_default_ctrlval(r, dc, dm); m.low = 0; m.high = r->num_closid; @@ -596,6 +612,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } kfree(d->ctrl_val); + kfree(d->mbps_val); kfree(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 74aee0fdc97c..91cc31087e8a 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -202,6 +202,7 @@ struct mbm_state { * @cqm_work_cpu: * worker cpu for CQM h/w counters * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain */ @@ -217,6 +218,7 @@ struct rdt_domain { int mbm_work_cpu; int cqm_work_cpu; u32 *ctrl_val; + u32 *mbps_val; u32 new_ctrl; bool have_new_ctrl; }; @@ -448,6 +450,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); bool is_mba_sc(struct rdt_resource *r); +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 440025446239..749856a2e736 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1055,12 +1055,15 @@ static int set_cache_qos_cfg(int level, bool enable) static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; + struct rdt_domain *d; if (!is_mbm_enabled() || !is_mba_linear() || mba_sc == is_mba_sc(r)) return -EINVAL; r->membw.mba_sc = mba_sc; + list_for_each_entry(d, &r->domains, list) + setup_default_ctrlval(r, d->ctrl_val, d->mbps_val); return 0; } -- cgit From 8205a078ba7819c23558e31af4b3bda04d9b3bae Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:19 -0700 Subject: x86/intel_rdt/mba_sc: Add schemata support Currently when user updates the "schemata" with new MBA percentage values, kernel writes the corresponding bandwidth percentage values to the IA32_MBA_THRTL_MSR. When MBA is expressed in MBps, the schemata format is changed to have the per package memory bandwidth in MBps instead of being specified in percentage. Do not write the IA32_MBA_THRTL_MSRs when the schemata is updated as that is handled separately. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-5-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 2 +- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 8c09e9db2fc6..ad03d975883e 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -179,7 +179,7 @@ struct rdt_resource rdt_resources_all[] = { .msr_update = mba_wrmsr, .cache_level = 3, .parse_ctrlval = parse_bw, - .format_str = "%d=%*d", + .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, }, }; diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 23e1d5c249c6..116d57b248d3 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -53,7 +53,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return false; } - if (bw < r->membw.min_bw || bw > r->default_ctrl) { + if ((bw < r->membw.min_bw || bw > r->default_ctrl) && + !is_mba_sc(r)) { rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, r->membw.min_bw, r->default_ctrl); return false; @@ -179,6 +180,8 @@ static int update_domains(struct rdt_resource *r, int closid) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; + bool mba_sc; + u32 *dc; int cpu; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) @@ -188,13 +191,20 @@ static int update_domains(struct rdt_resource *r, int closid) msr_param.high = msr_param.low + 1; msr_param.res = r; + mba_sc = is_mba_sc(r); list_for_each_entry(d, &r->domains, list) { - if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + dc = !mba_sc ? d->ctrl_val : d->mbps_val; + if (d->have_new_ctrl && d->new_ctrl != dc[closid]) { cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - d->ctrl_val[closid] = d->new_ctrl; + dc[closid] = d->new_ctrl; } } - if (cpumask_empty(cpu_mask)) + + /* + * Avoid writing the control msr with control values when + * MBA software controller is enabled + */ + if (cpumask_empty(cpu_mask) || mba_sc) goto done; cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ @@ -282,13 +292,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) { struct rdt_domain *dom; bool sep = false; + u32 ctrl_val; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); + + ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] : + dom->mbps_val[closid]); seq_printf(s, r->format_str, dom->id, max_data_width, - dom->ctrl_val[closid]); + ctrl_val); sep = true; } seq_puts(s, "\n"); -- cgit From ba0f26d8529c2dfc9aa6d9e8a338180737f8c1be Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:20 -0700 Subject: x86/intel_rdt/mba_sc: Prepare for feedback loop This is a preparatory patch for the mba feedback loop. Add support to measure the "bandwidth in MBps" and the "delta bandwidth". Measure it by reading the MBM IA32_QM_CTR MSRs and calculating the amount of "bytes" moved. There is no user space interface for this and will only be used by the feedback loop patch. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-6-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 10 ++++++++ arch/x86/kernel/cpu/intel_rdt_monitor.c | 44 ++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 91cc31087e8a..66a0ba37a8a3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -180,10 +180,20 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + * @chunks_bw Total local data moved. Used for bandwidth calculation + * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw The most recent bandwidth in MBps + * @delta_bw Difference between the current and previous bandwidth + * @delta_comp Indicates whether to compute the delta_bw */ struct mbm_state { u64 chunks; u64 prev_msr; + u64 chunks_bw; + u64 prev_bw_msr; + u32 prev_bw; + u32 delta_bw; + bool delta_comp; }; /** diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 681450eee428..7690402c42b7 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -225,10 +225,18 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +{ + u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >>= shift; +} + static int __mon_event_count(u32 rmid, struct rmid_read *rr) { - u64 chunks, shift, tval; struct mbm_state *m; + u64 chunks, tval; tval = __rmid_read(rmid, rr->evtid); if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { @@ -254,14 +262,12 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) } if (rr->first) { - m->prev_msr = tval; - m->chunks = 0; + memset(m, 0, sizeof(struct mbm_state)); + m->prev_bw_msr = m->prev_msr = tval; return 0; } - shift = 64 - MBM_CNTR_WIDTH; - chunks = (tval << shift) - (m->prev_msr << shift); - chunks >>= shift; + chunks = mbm_overflow_count(m->prev_msr, tval); m->chunks += chunks; m->prev_msr = tval; @@ -269,6 +275,32 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } +/* + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. + */ +static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct mbm_state *m = &rr->d->mbm_local[rmid]; + u64 tval, cur_bw, chunks; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + chunks = mbm_overflow_count(m->prev_bw_msr, tval); + m->chunks_bw += chunks; + m->chunks = m->chunks_bw; + cur_bw = (chunks * r->mon_scale) >> 20; + + if (m->delta_comp) + m->delta_bw = abs(cur_bw - m->prev_bw); + m->delta_comp = false; + m->prev_bw = cur_bw; + m->prev_bw_msr = tval; +} + /* * This is called via IPI to read the CQM/MBM counters * on a domain. -- cgit From de73f38f768021610bd305cf74ef3702fcf6a1eb Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:21 -0700 Subject: x86/intel_rdt/mba_sc: Feedback loop to dynamically update mem bandwidth mba_sc is a feedback loop where we periodically read MBM counters and try to restrict the bandwidth below a max value so the below is always true: "current bandwidth(cur_bw) < user specified bandwidth(user_bw)" The frequency of these checks is currently 1s and we just tag along the MBM overflow timer to do the updates. Doing it once in a second also makes the calculation of bandwidth easy. The steps of increase or decrease of bandwidth is the minimum granularity specified by the hardware. Although the MBA's goal is to restrict the bandwidth below a maximum, there may be a need to even increase the bandwidth. Since MBA controls the L2 external bandwidth where as MBM measures the L3 external bandwidth, we may end up restricting some rdtgroups unnecessarily. This may happen in the sequence where rdtgroup (set of jobs) had high "L3 <-> memory traffic" in initial phases -> mba_sc kicks in and reduced bandwidth percentage values -> but after some it has mostly "L2 <-> L3" traffic. In this scenario mba_sc increases the bandwidth percentage when there is lesser memory traffic. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-7-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 3 +- arch/x86/kernel/cpu/intel_rdt.h | 2 + arch/x86/kernel/cpu/intel_rdt_monitor.c | 126 +++++++++++++++++++++++++++++++- 3 files changed, 128 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index ad03d975883e..24bfa63e86cf 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -33,7 +33,6 @@ #include #include "intel_rdt.h" -#define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX @@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level) * that can be written to QOS_MSRs. * There are currently no SKUs which support non linear delay values. */ -static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { if (r->membw.delay_linear) return MAX_MBA_BW - bw; diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 66a0ba37a8a3..39752825e376 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -28,6 +28,7 @@ #define MBM_CNTR_WIDTH 24 #define MBM_OVERFLOW_INTERVAL 1000 +#define MAX_MBA_BW 100u #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) @@ -461,6 +462,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, void mbm_handle_overflow(struct work_struct *work); bool is_mba_sc(struct rdt_resource *r); void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 7690402c42b7..b0f3aed76b75 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -329,6 +329,118 @@ void mon_event_count(void *info) } } +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid uncecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth where as MBM measures the + * L3 external bandwidth the following sequence could lead to such a + * situation. + * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values. To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. + */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +{ + u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; + struct mbm_state *pmbm_data, *cmbm_data; + u32 cur_bw, delta_bw, user_bw; + struct rdt_resource *r_mba; + struct rdt_domain *dom_mba; + struct list_head *head; + struct rdtgroup *entry; + + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + closid = rgrp->closid; + rmid = rgrp->mon.rmid; + pmbm_data = &dom_mbm->mbm_local[rmid]; + + dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); + if (!dom_mba) { + pr_warn_once("Failure to get domain for MBA update\n"); + return; + } + + cur_bw = pmbm_data->prev_bw; + user_bw = dom_mba->mbps_val[closid]; + delta_bw = pmbm_data->delta_bw; + cur_msr_val = dom_mba->ctrl_val[closid]; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rgrp->mon.crdtgrp_list; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cur_bw += cmbm_data->prev_bw; + delta_bw += cmbm_data->delta_bw; + } + + /* + * Scale up/down the bandwidth linearly for the ctrl group. The + * bandwidth step is the bandwidth granularity specified by the + * hardware. + * + * The delta_bw is used when increasing the bandwidth so that we + * dont alternately increase and decrease the control values + * continuously. + * + * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if + * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep + * switching between 90 and 110 continuously if we only check + * cur_bw < user_bw. + */ + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; + } else if (cur_msr_val < MAX_MBA_BW && + (user_bw > (cur_bw + delta_bw))) { + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; + } else { + return; + } + + cur_msr = r_mba->msr_base + closid; + wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); + dom_mba->ctrl_val[closid] = new_msr_val; + + /* + * Delta values are updated dynamically package wise for each + * rdtgrp everytime the throttle MSR changes value. + * + * This is because (1)the increase in bandwidth is not perfectly + * linear and only "approximately" linear even when the hardware + * says it is linear.(2)Also since MBA is a core specific + * mechanism, the delta values vary based on number of cores used + * by the rdtgrp. + */ + pmbm_data->delta_comp = true; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cmbm_data->delta_comp = true; + } +} + static void mbm_update(struct rdt_domain *d, int rmid) { struct rmid_read rr; @@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid) } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; - __mon_event_count(rmid, &rr); + + /* + * Call the MBA software controller only for the + * control groups and when user has enabled + * the software controller explicitly. + */ + if (!is_mba_sc(NULL)) + __mon_event_count(rmid, &rr); + else + mbm_bw_count(rmid, &rr); } } @@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work) head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) mbm_update(d, crgrp->mon.rmid); + + if (is_mba_sc(NULL)) + update_mba_bw(prgrp, d); } schedule_delayed_work_on(cpu, &d->mbm_over, delay); -- cgit From e27c49291a7fe9dc415c9fcab5bd781ec82dfe04 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Apr 2018 22:13:23 +0200 Subject: x86: Convert x86_platform_ops to timespec64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The x86 platform operations are fairly isolated, so it's easy to change them from using timespec to timespec64. It has been checked that all the users and callers are safe, and there is only one critical function that is broken beyond 2106: pvclock_read_wallclock() uses a 32-bit number of seconds since the epoch to communicate the boot time between host and guest in a virtual environment. This will work until 2106, but fixing this is outside the scope of this change, Add a comment at least. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Boris Ostrovsky Acked-by: Radim Krčmář Acked-by: Jan Kiszka Cc: Juergen Gross Cc: jailhouse-dev@googlegroups.com Cc: Borislav Petkov Cc: kvm@vger.kernel.org Cc: y2038@lists.linaro.org Cc: "Rafael J. Wysocki" Cc: xen-devel@lists.xenproject.org Cc: John Stultz Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Andy Shevchenko Cc: Joao Martins Link: https://lkml.kernel.org/r/20180427201435.3194219-1-arnd@arndb.de --- arch/x86/kernel/jailhouse.c | 2 +- arch/x86/kernel/kvmclock.c | 4 ++-- arch/x86/kernel/pvclock.c | 15 +++++++++++---- arch/x86/kernel/rtc.c | 10 +++++----- 4 files changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index a15fe0e92cf9..108c48d0d40e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -37,7 +37,7 @@ static uint32_t __init jailhouse_detect(void) return jailhouse_cpuid_base(); } -static void jailhouse_get_wallclock(struct timespec *now) +static void jailhouse_get_wallclock(struct timespec64 *now) { memset(now, 0, sizeof(*now)); } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 8b26c9e01cc4..bf8d1eb7fca3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -53,7 +53,7 @@ static struct pvclock_wall_clock *wall_clock; * have elapsed since the hypervisor wrote the data. So we try to account for * that with system time */ -static void kvm_get_wallclock(struct timespec *now) +static void kvm_get_wallclock(struct timespec64 *now) { struct pvclock_vcpu_time_info *vcpu_time; int low, high; @@ -72,7 +72,7 @@ static void kvm_get_wallclock(struct timespec *now) put_cpu(); } -static int kvm_set_wallclock(const struct timespec *now) +static int kvm_set_wallclock(const struct timespec64 *now) { return -ENODEV; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 761f6af6efa5..637982efecd8 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -123,28 +123,35 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, struct pvclock_vcpu_time_info *vcpu_time, - struct timespec *ts) + struct timespec64 *ts) { u32 version; u64 delta; - struct timespec now; + struct timespec64 now; /* get wallclock at system boot */ do { version = wall_clock->version; rmb(); /* fetch version before time */ + /* + * Note: wall_clock->sec is a u32 value, so it can + * only store dates between 1970 and 2106. To allow + * times beyond that, we need to create a new hypercall + * interface with an extended pvclock_wall_clock structure + * like ARM has. + */ now.tv_sec = wall_clock->sec; now.tv_nsec = wall_clock->nsec; rmb(); /* fetch time before checking version */ } while ((wall_clock->version & 1) || (version != wall_clock->version)); delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec; now.tv_nsec = do_div(delta, NSEC_PER_SEC); now.tv_sec = delta; - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec); } void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index f7b82ed7b5b5..586f718b8e95 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -39,7 +39,7 @@ EXPORT_SYMBOL(rtc_lock); * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(const struct timespec *now) +int mach_set_rtc_mmss(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; @@ -60,7 +60,7 @@ int mach_set_rtc_mmss(const struct timespec *now) return retval; } -void mach_get_cmos_time(struct timespec *now) +void mach_get_cmos_time(struct timespec64 *now) { unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; @@ -118,7 +118,7 @@ void mach_get_cmos_time(struct timespec *now) } else year += CMOS_YEARS_OFFS; - now->tv_sec = mktime(year, mon, day, hour, min, sec); + now->tv_sec = mktime64(year, mon, day, hour, min, sec); now->tv_nsec = 0; } @@ -145,13 +145,13 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -int update_persistent_clock(struct timespec now) +int update_persistent_clock64(struct timespec64 now) { return x86_platform.set_wallclock(&now); } /* not static: needed by APM */ -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { x86_platform.get_wallclock(ts); } -- cgit From 844ea8f62619ac2a5e47d13e3449ac197067b414 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 28 Apr 2018 10:24:48 +0100 Subject: x86/apm: Fix spelling mistake: "caculate" -> "calculate" Trivial fix to spelling mistake in module parameter description text Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Jiri Kosina Cc: kernel-janitors@vger.kernel.org Cc: "H . Peter Anvin" Link: https://lkml.kernel.org/r/20180428092448.6493-1-colin.king@canonical.com --- arch/x86/kernel/apm_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index dfcbe6924eaf..6b345f35f023 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2446,7 +2446,7 @@ MODULE_PARM_DESC(idle_threshold, "System idle percentage above which to make APM BIOS idle calls"); module_param(idle_period, int, 0444); MODULE_PARM_DESC(idle_period, - "Period (in sec/100) over which to caculate the idle percentage"); + "Period (in sec/100) over which to calculate the idle percentage"); module_param(smp, bool, 0444); MODULE_PARM_DESC(smp, "Set this to enable APM use on an SMP platform. Use with caution on older systems"); -- cgit From fbf96cf904dc154a28338fe68f72902e9af57afc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 17 May 2018 18:32:33 +0200 Subject: x86/MCE/AMD: Read MCx_MISC block addresses on any CPU We used rdmsr_safe_on_cpu() to make sure we're reading the proper CPU's MISC block addresses. However, that caused trouble with CPU hotplug due to the _on_cpu() helper issuing an IPI while IRQs are disabled. But we don't have to do that: the block addresses are the same on any CPU so we can read them on any CPU. (What practically happens is, we read them on the BSP and cache them, and for later reads, we service them from the cache). Suggested-by: Yazen Ghannam Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index c8e038800591..f591b01930db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -436,8 +436,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, - unsigned int block) +static u32 smca_get_block_address(unsigned int bank, unsigned int block) { u32 low, high; u32 addr = 0; @@ -456,13 +455,13 @@ static u32 smca_get_block_address(unsigned int cpu, unsigned int bank, * For SMCA enabled processors, BLKPTR field of the first MISC register * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). */ - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) goto out; if (!(low & MCI_CONFIG_MCAX)) goto out; - if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); @@ -471,7 +470,7 @@ out: return addr; } -static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, +static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -480,7 +479,7 @@ static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 hi return addr; if (mce_flags.smca) - return smca_get_block_address(cpu, bank, block); + return smca_get_block_address(bank, block); /* Fall back to method we used for older processors: */ switch (block) { @@ -558,7 +557,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(cpu, address, low, high, bank, block); + address = get_block_address(address, low, high, bank, block); if (!address) break; @@ -1175,7 +1174,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - address = get_block_address(cpu, address, low, high, bank, ++block); + address = get_block_address(address, low, high, bank, ++block); if (!address) return 0; -- cgit From f64c6013a2029316ea552f99823d116ee5f5f955 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2018 09:50:53 -0700 Subject: rcu/x86: Provide early rcu_cpu_starting() callback The x86/mtrr code does horrific things because hardware. It uses stop_machine_from_inactive_cpu(), which does a wakeup (of the stopper thread on another CPU), which uses RCU, all before the CPU is onlined. RCU complains about this, because wakeups use RCU and RCU does (rightfully) not consider offline CPUs for grace-periods. Fix this by initializing RCU way early in the MTRR case. Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Add !SMP support, per 0day Test Robot report. ] --- arch/x86/kernel/cpu/mtrr/main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7468de429087..3ea0047beb40 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -793,6 +794,9 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; + + rcu_cpu_starting(smp_processor_id()); + /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, -- cgit From 10b1105004fbd81058383537b67df35cc188ab62 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Thu, 24 May 2018 17:11:54 +0300 Subject: perf/x86: Store user space frame-pointer value on a sample Store user space frame-pointer value (BP register) into the perf trace on a sample for a process so the value becomes available when unwinding call stacks for functions gaining event samples. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/311d4a34-f81b-5535-3385-01427ac73b41@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/perf_regs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index e47b2dbbdef3..c06c4c16c6b6 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -151,17 +151,19 @@ void perf_get_regs_user(struct perf_regs *regs_user, regs_user_copy->sp = user_regs->sp; regs_user_copy->cs = user_regs->cs; regs_user_copy->ss = user_regs->ss; - /* - * Most system calls don't save these registers, don't report them. + * Store user space frame-pointer value on sample + * to facilitate stack unwinding for cases when + * user space executable code has such support + * enabled at compile time: */ + regs_user_copy->bp = user_regs->bp; + regs_user_copy->bx = -1; - regs_user_copy->bp = -1; regs_user_copy->r12 = -1; regs_user_copy->r13 = -1; regs_user_copy->r14 = -1; regs_user_copy->r15 = -1; - /* * For this to be at all useful, we need a reasonable guess for * the ABI. Be careful: we're in NMI context, and we're -- cgit From 884571f0de7b02bb784be3a5c870eabce62cdaeb Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Fri, 25 May 2018 13:00:00 +0800 Subject: dma-mapping: remove unused gfp_t parameter to arch_dma_alloc_attrs Signed-off-by: Huaisheng Ye Signed-off-by: Christoph Hellwig --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index bcbaa2e8031e..c7113fd18e32 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -73,7 +73,7 @@ void __init pci_iommu_alloc(void) } } -bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) +bool arch_dma_alloc_attrs(struct device **dev) { if (!*dev) *dev = &x86_dma_fallback_dev; -- cgit From 06e9552f5f12564dc1c3483f0934d96cc4f72f18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Apr 2018 09:13:24 +0200 Subject: x86/pci-dma: remove the experimental forcesac boot option Limiting the dma mask to avoid PCI (pre-PCIe) DAC cycles while paying the huge overhead of an IOMMU is rather pointless, and this seriously gets in the way of dma mapping work. Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Gleixner --- arch/x86/kernel/pci-dma.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c7113fd18e32..b65b0d7072f1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -20,8 +20,6 @@ static int forbid_dac __read_mostly; const struct dma_map_ops *dma_ops = &dma_direct_ops; EXPORT_SYMBOL(dma_ops); -static int iommu_sac_force __read_mostly; - #ifdef CONFIG_IOMMU_DEBUG int panic_on_overflow __read_mostly = 1; int force_iommu __read_mostly = 1; @@ -122,7 +120,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "nomerge", 7)) iommu_merge = 0; if (!strncmp(p, "forcesac", 8)) - iommu_sac_force = 1; + pr_warn("forcesac option ignored.\n"); if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) @@ -162,23 +160,6 @@ int arch_dma_supported(struct device *dev, u64 mask) } #endif - /* Tell the device to use SAC when IOMMU force is on. This - allows the driver to use cheaper accesses in some cases. - - Problem with this is that if we overflow the IOMMU area and - return DAC as fallback address the device may not handle it - correctly. - - As a special case some controllers have a 39bit address - mode that is as efficient as 32bit (aic79xx). Don't force - SAC for these. Assume all masks <= 40 bits are of this - type. Normally this doesn't make any difference, but gives - more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { - dev_info(dev, "Force SAC with mask %Lx\n", mask); - return 0; - } - return 1; } EXPORT_SYMBOL(arch_dma_supported); -- cgit From 098afd981744061d37eb4d40bc3f755438570afb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Apr 2018 09:31:47 +0200 Subject: x86/pci-dma: remove the explicit nodac and allowdac option This is something drivers should decide (modulo chipset quirks like for VIA), which as far as I can tell is how things have been handled for the last 15 years. Note that we keep the usedac option for now, as it is used in the wild to override the too generic VIA quirk. Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Gleixner --- arch/x86/kernel/pci-dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b65b0d7072f1..b523414bc323 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -122,9 +122,9 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "forcesac", 8)) pr_warn("forcesac option ignored.\n"); if (!strncmp(p, "allowdac", 8)) - forbid_dac = 0; + pr_warn("allowdac option ignored.\n"); if (!strncmp(p, "nodac", 5)) - forbid_dac = 1; + pr_warn("nodac option ignored.\n"); if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; -- cgit From 0ead51c3fbd15a4bc91e984f1b18b5c9422fbb02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 May 2018 12:47:57 +0200 Subject: x86/pci-dma: switch the VIA 32-bit DMA quirk to use the struct device flag Instead of globally disabling > 32bit DMA using the arch_dma_supported hook walk the PCI bus under the actually affected bridge and mark every device with the dma_32bit_limit flag. This also gets rid of the arch_dma_supported hook entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Gleixner --- arch/x86/kernel/pci-dma.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b523414bc323..ab5d9dd668d2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -15,7 +15,7 @@ #include #include -static int forbid_dac __read_mostly; +static bool disable_dac_quirk __read_mostly; const struct dma_map_ops *dma_ops = &dma_direct_ops; EXPORT_SYMBOL(dma_ops); @@ -126,7 +126,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "nodac", 5)) pr_warn("nodac option ignored.\n"); if (!strncmp(p, "usedac", 6)) { - forbid_dac = -1; + disable_dac_quirk = true; return 1; } #ifdef CONFIG_SWIOTLB @@ -151,19 +151,6 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int arch_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI - if (mask > 0xffffffff && forbid_dac > 0) { - dev_info(dev, "PCI: Disallowing DAC for device\n"); - return 0; - } -#endif - - return 1; -} -EXPORT_SYMBOL(arch_dma_supported); - static int __init pci_iommu_init(void) { struct iommu_table_entry *p; @@ -186,11 +173,17 @@ rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ +static int via_no_dac_cb(struct pci_dev *pdev, void *data) +{ + pdev->dev.dma_32bit_limit = true; + return 0; +} + static void via_no_dac(struct pci_dev *dev) { - if (forbid_dac == 0) { + if (!disable_dac_quirk) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); - forbid_dac = 1; + pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL); } } DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, -- cgit From d6761b8fd96967f7ff4b16c6875e94929a897916 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 2 Jun 2018 08:43:58 -0400 Subject: x86: Add support for restartable sequences Call the rseq_handle_notify_resume() function on return to userspace if TIF_NOTIFY_RESUME thread flag is set. Perform fixup on the pre-signal frame when a signal is delivered on top of a restartable sequence critical section. Check that system calls are not invoked from within rseq critical sections by invoking rseq_signal() from syscall_return_slowpath(). With CONFIG_DEBUG_RSEQ, such behavior results in termination of the process with SIGSEGV. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Joel Fernandes Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Dave Watson Cc: Will Deacon Cc: Andi Kleen Cc: "H . Peter Anvin" Cc: Chris Lameter Cc: Russell King Cc: Andrew Hunter Cc: Michael Kerrisk Cc: "Paul E . McKenney" Cc: Paul Turner Cc: Boqun Feng Cc: Josh Triplett Cc: Steven Rostedt Cc: Ben Maurer Cc: linux-api@vger.kernel.org Cc: Andy Lutomirski Cc: Andrew Morton Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20180602124408.8430-7-mathieu.desnoyers@efficios.com --- arch/x86/kernel/signal.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da270b95fe4d..445ca11ff863 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -688,6 +688,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; + /* + * Increment event counter and perform fixup for the pre-signal + * frame. + */ + rseq_signal_deliver(regs); + /* Set up the stack frame */ if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) -- cgit From 24809860012e0130fbafe536709e08a22b3e959e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 1 Jun 2018 10:59:19 -0400 Subject: x86/bugs: Add AMD's variant of SSB_NO The AMD document outlining the SSBD handling 124441_AMD64_SpeculativeStoreBypassDisable_Whitepaper_final.pdf mentions that the CPUID 8000_0008.EBX[26] will mean that the speculative store bypass disable is no longer needed. A copy of this document is available at: https://bugzilla.kernel.org/show_bug.cgi?id=199889 Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Janakarajan Natarajan Cc: kvm@vger.kernel.org Cc: andrew.cooper3@citrix.com Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180601145921.9500-2-konrad.wilk@oracle.com --- arch/x86/kernel/cpu/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 95c8e507580d..8e3f062f462d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -992,7 +992,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); if (!x86_match_cpu(cpu_no_spec_store_bypass) && - !(ia32_cap & ARCH_CAP_SSB_NO)) + !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (x86_match_cpu(cpu_no_meltdown)) -- cgit From 6ac2f49edb1ef5446089c7c660017732886d62d6 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 1 Jun 2018 10:59:20 -0400 Subject: x86/bugs: Add AMD's SPEC_CTRL MSR usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AMD document outlining the SSBD handling 124441_AMD64_SpeculativeStoreBypassDisable_Whitepaper_final.pdf mentions that if CPUID 8000_0008.EBX[24] is set we should be using the SPEC_CTRL MSR (0x48) over the VIRT SPEC_CTRL MSR (0xC001_011f) for speculative store bypass disable. This in effect means we should clear the X86_FEATURE_VIRT_SSBD flag so that we would prefer the SPEC_CTRL MSR. See the document titled: 124441_AMD64_SpeculativeStoreBypassDisable_Whitepaper_final.pdf A copy of this document is available at https://bugzilla.kernel.org/show_bug.cgi?id=199889 Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Janakarajan Natarajan Cc: kvm@vger.kernel.org Cc: KarimAllah Ahmed Cc: andrew.cooper3@citrix.com Cc: Joerg Roedel Cc: Radim Krčmář Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Borislav Petkov Cc: David Woodhouse Cc: Kees Cook Link: https://lkml.kernel.org/r/20180601145921.9500-3-konrad.wilk@oracle.com --- arch/x86/kernel/cpu/bugs.c | 12 +++++++----- arch/x86/kernel/cpu/common.c | 6 ++++++ 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7416fc206b4a..6bea81855cdd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -529,18 +529,20 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) if (mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* - * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses - * a completely different MSR and bit dependent on family. + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may + * use a completely different MSR and bit dependent on family. */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: + case X86_VENDOR_AMD: + if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { + x86_amd_ssb_disable(); + break; + } x86_spec_ctrl_base |= SPEC_CTRL_SSBD; x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); break; - case X86_VENDOR_AMD: - x86_amd_ssb_disable(); - break; } } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8e3f062f462d..910b47ee8078 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -803,6 +803,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_STIBP); set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); } + + if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { + set_cpu_cap(c, X86_FEATURE_SSBD); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); + } } void get_cpu_cap(struct cpuinfo_x86 *c) -- cgit From 108fab4b5c8f12064ef86e02cb0459992affb30f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 1 Jun 2018 10:59:21 -0400 Subject: x86/bugs: Switch the selection of mitigation from CPU vendor to CPU features Both AMD and Intel can have SPEC_CTRL_MSR for SSBD. However AMD also has two more other ways of doing it - which are !SPEC_CTRL MSR ways. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Cc: Kees Cook Cc: kvm@vger.kernel.org Cc: KarimAllah Ahmed Cc: andrew.cooper3@citrix.com Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180601145921.9500-4-konrad.wilk@oracle.com --- arch/x86/kernel/cpu/bugs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6bea81855cdd..cd0fda1fff6d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -532,17 +532,12 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may * use a completely different MSR and bit dependent on family. */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - case X86_VENDOR_AMD: - if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - x86_amd_ssb_disable(); - break; - } + if (!static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + x86_amd_ssb_disable(); + else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - break; } } -- cgit From 6da2ec56059c3c7a7e5f729e6349e74ace1e5c57 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 13:55:00 -0700 Subject: treewide: kmalloc() -> kmalloc_array() The kmalloc() function has a 2-factor argument form, kmalloc_array(). This patch replaces cases of: kmalloc(a * b, gfp) with: kmalloc_array(a * b, gfp) as well as handling cases of: kmalloc(a * b * c, gfp) with: kmalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kmalloc_array(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kmalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The tools/ directory was manually excluded, since it has its own implementation of kmalloc(). The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(char) * COUNT + COUNT , ...) | kmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kmalloc + kmalloc_array ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kmalloc(C1 * C2 * C3, ...) | kmalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kmalloc(sizeof(THING) * C2, ...) | kmalloc(sizeof(TYPE) * C2, ...) | kmalloc(C1 * C2 * C3, ...) | kmalloc(C1 * C2, ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - (E1) * E2 + E1, E2 , ...) | - kmalloc + kmalloc_array ( - (E1) * (E2) + E1, E2 , ...) | - kmalloc + kmalloc_array ( - E1 * E2 + E1, E2 , ...) ) Signed-off-by: Kees Cook --- arch/x86/kernel/hpet.c | 4 ++-- arch/x86/kernel/ksysfs.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b6be34ee88e9..ddccdea0b63b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -966,8 +966,8 @@ int __init hpet_enable(void) #endif cfg = hpet_readl(HPET_CFG); - hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), - GFP_KERNEL); + hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg), + GFP_KERNEL); if (hpet_boot_cfg) *hpet_boot_cfg = cfg; else diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 8c1cc08f514f..163ae706a0d4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -283,7 +283,7 @@ static int __init create_setup_data_nodes(struct kobject *parent) if (ret) goto out_setup_data_kobj; - kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); + kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL); if (!kobjp) { ret = -ENOMEM; goto out_setup_data_kobj; -- cgit From 6396bb221514d2876fd6dc0aa2a1f240d99b37bb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:03:40 -0700 Subject: treewide: kzalloc() -> kcalloc() The kzalloc() function has a 2-factor argument form, kcalloc(). This patch replaces cases of: kzalloc(a * b, gfp) with: kcalloc(a * b, gfp) as well as handling cases of: kzalloc(a * b * c, gfp) with: kzalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kzalloc_array(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kzalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kzalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kzalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kzalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(u8) * COUNT + COUNT , ...) | kzalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kzalloc( - sizeof(char) * COUNT + COUNT , ...) | kzalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kzalloc + kcalloc ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kzalloc + kcalloc ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kzalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kzalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kzalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kzalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kzalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kzalloc(C1 * C2 * C3, ...) | kzalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kzalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kzalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kzalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kzalloc(sizeof(THING) * C2, ...) | kzalloc(sizeof(TYPE) * C2, ...) | kzalloc(C1 * C2 * C3, ...) | kzalloc(C1 * C2, ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kzalloc + kcalloc ( - (E1) * E2 + E1, E2 , ...) | - kzalloc + kcalloc ( - (E1) * (E2) + E1, E2 , ...) | - kzalloc + kcalloc ( - E1 * E2 + E1, E2 , ...) ) Signed-off-by: Kees Cook --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- arch/x86/kernel/cpu/mtrr/if.c | 2 +- arch/x86/kernel/hpet.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cd76380af79f..e4cf6ff1c2e1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1457,7 +1457,7 @@ static int __mcheck_cpu_mce_banks_init(void) int i; u8 num_banks = mca_cfg.banks; - mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f591b01930db..dd33c357548f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1384,7 +1384,7 @@ int mce_threshold_create_device(unsigned int cpu) if (bp) return 0; - bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *), GFP_KERNEL); if (!bp) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index c610f47373e4..4021d3859499 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -43,7 +43,7 @@ mtrr_file_add(unsigned long base, unsigned long size, max = num_var_ranges; if (fcount == NULL) { - fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL); + fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL); if (!fcount) return -ENOMEM; FILE_FCOUNT(file) = fcount; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ddccdea0b63b..346b24883911 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -610,7 +610,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) if (!hpet_domain) return; - hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + hpet_devs = kcalloc(num_timers, sizeof(struct hpet_dev), GFP_KERNEL); if (!hpet_devs) return; -- cgit From 050e9baa9dc9fbd9ce2b27f0056990fc9e0a08a0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Jun 2018 12:21:18 +0900 Subject: Kbuild: rename CC_STACKPROTECTOR[_STRONG] config variables The changes to automatically test for working stack protector compiler support in the Kconfig files removed the special STACKPROTECTOR_AUTO option that picked the strongest stack protector that the compiler supported. That was all a nice cleanup - it makes no sense to have the AUTO case now that the Kconfig phase can just determine the compiler support directly. HOWEVER. It also meant that doing "make oldconfig" would now _disable_ the strong stackprotector if you had AUTO enabled, because in a legacy config file, the sane stack protector configuration would look like CONFIG_HAVE_CC_STACKPROTECTOR=y # CONFIG_CC_STACKPROTECTOR_NONE is not set # CONFIG_CC_STACKPROTECTOR_REGULAR is not set # CONFIG_CC_STACKPROTECTOR_STRONG is not set CONFIG_CC_STACKPROTECTOR_AUTO=y and when you ran this through "make oldconfig" with the Kbuild changes, it would ask you about the regular CONFIG_CC_STACKPROTECTOR (that had been renamed from CONFIG_CC_STACKPROTECTOR_REGULAR to just CONFIG_CC_STACKPROTECTOR), but it would think that the STRONG version used to be disabled (because it was really enabled by AUTO), and would disable it in the new config, resulting in: CONFIG_HAVE_CC_STACKPROTECTOR=y CONFIG_CC_HAS_STACKPROTECTOR_NONE=y CONFIG_CC_STACKPROTECTOR=y # CONFIG_CC_STACKPROTECTOR_STRONG is not set CONFIG_CC_HAS_SANE_STACKPROTECTOR=y That's dangerously subtle - people could suddenly find themselves with the weaker stack protector setup without even realizing. The solution here is to just rename not just the old RECULAR stack protector option, but also the strong one. This does that by just removing the CC_ prefix entirely for the user choices, because it really is not about the compiler support (the compiler support now instead automatially impacts _visibility_ of the options to users). This results in "make oldconfig" actually asking the user for their choice, so that we don't have any silent subtle security model changes. The end result would generally look like this: CONFIG_HAVE_CC_STACKPROTECTOR=y CONFIG_CC_HAS_STACKPROTECTOR_NONE=y CONFIG_STACKPROTECTOR=y CONFIG_STACKPROTECTOR_STRONG=y CONFIG_CC_HAS_SANE_STACKPROTECTOR=y where the "CC_" versions really are about internal compiler infrastructure, not the user selections. Acked-by: Masahiro Yamada Signed-off-by: Linus Torvalds --- arch/x86/kernel/asm-offsets.c | 2 +- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/head_32.S | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 76417a9aab73..dcb008c320fe 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -32,7 +32,7 @@ void common(void) { BLANK(); OFFSET(TASK_threadsp, task_struct, thread.sp); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR OFFSET(TASK_stack_canary, task_struct, stack_canary); #endif diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index f91ba53e06c8..a4a3be399f4b 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -50,7 +50,7 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR BLANK(); OFFSET(stack_canary_offset, stack_canary, canary); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bf51e51d808d..b2dcd161f514 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -69,7 +69,7 @@ int main(void) OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 910b47ee8078..0df7151cfef4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1599,7 +1599,7 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = (unsigned long)&init_thread_union + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b59e4fb40fd9..abe6df15a8fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -375,7 +375,7 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR /* * Configure the stack canary. The linker can't handle this by * relocation. Manually set base address in stack canary -- cgit