From 395810627b6a43c8d0ec948884043946fa162308 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Jun 2011 16:09:21 +0900 Subject: x86: Swap save_stack_trace_regs parameters Swap the 1st and 2nd parameters of save_stack_trace_regs() as same as the parameters of save_stack_trace_tsk(). Signed-off-by: Masami Hiramatsu Cc: yrl.pp-manager.tt@hitachi.com Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20110608070921.17777.31103.stgit@fedora15 Signed-off-by: Steven Rostedt --- arch/x86/kernel/stacktrace.c | 2 +- arch/x86/mm/kmemcheck/error.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 55d9bc03f696..fdd0c6430e5a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 704a37cedddb..dab41876cdd5 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_regs(&e->trace, regs); + save_stack_trace_regs(regs, &e->trace); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address -- cgit From a0e3e70243f5b270bc3eca718f0a9fa5e6b8262e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 3 Jun 2011 16:37:47 +0200 Subject: oprofile, x86: Fix nmi-unsafe callgraph support Current oprofile's x86 callgraph support may trigger page faults throwing the BUG_ON(in_nmi()) message below. This patch fixes this by using the same nmi-safe copy-from-user code as in perf. ------------[ cut here ]------------ kernel BUG at .../arch/x86/kernel/traps.c:436! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:00/0000:00:0a.0/0000:07:00.0/0000:08:04.0/net/eth0/broadcast CPU 5 Modules linked in: Pid: 8611, comm: opcontrol Not tainted 2.6.39-00007-gfe47ae7 #1 Advanced Micro Device Anaheim/Anaheim RIP: 0010:[] [] do_nmi+0x22/0x1ee RSP: 0000:ffff88042fd47f28 EFLAGS: 00010002 RAX: ffff88042c0a7fd8 RBX: 0000000000000001 RCX: 00000000c0000101 RDX: 00000000ffff8804 RSI: ffffffffffffffff RDI: ffff88042fd47f58 RBP: ffff88042fd47f48 R08: 0000000000000004 R09: 0000000000001484 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88042fd47f58 R13: 0000000000000000 R14: ffff88042fd47d98 R15: 0000000000000020 FS: 00007fca25e56700(0000) GS:ffff88042fd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000074 CR3: 000000042d28b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process opcontrol (pid: 8611, threadinfo ffff88042c0a6000, task ffff88042c532310) Stack: 0000000000000000 0000000000000001 ffff88042c0a7fd8 0000000000000000 ffff88042fd47de8 ffffffff813e897a 0000000000000020 ffff88042fd47d98 0000000000000000 ffff88042c0a7fd8 ffff88042fd47de8 0000000000000074 Call Trace: [] nmi+0x1a/0x20 [] ? bad_to_user+0x25/0x771 <> Code: ff 59 5b 41 5c 41 5d c9 c3 55 65 48 8b 04 25 88 b5 00 00 48 89 e5 41 55 41 54 49 89 fc 53 48 83 ec 08 f6 80 47 e0 ff ff 04 74 04 <0f> 0b eb fe 81 80 44 e0 ff ff 00 00 01 04 65 ff 04 25 c4 0f 01 RIP [] do_nmi+0x22/0x1ee RSP ---[ end trace ed6752185092104b ]--- Kernel panic - not syncing: Fatal exception in interrupt Pid: 8611, comm: opcontrol Tainted: G D 2.6.39-00007-gfe47ae7 #1 Call Trace: [] panic+0x8c/0x188 [] oops_end+0x81/0x8e [] die+0x55/0x5e [] do_trap+0x11c/0x12b [] do_invalid_op+0x91/0x9a [] ? do_nmi+0x22/0x1ee [] ? oprofile_add_sample+0x83/0x95 [] ? op_amd_check_ctrs+0x4f/0x2cf [] invalid_op+0x15/0x20 [] ? do_nmi+0x22/0x1ee [] ? do_nmi+0x67/0x1ee [] nmi+0x1a/0x20 [] ? bad_to_user+0x25/0x771 <> Cc: John Lumby Cc: Maynard Johnson Cc: # .37+ Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 56 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index a5b64ab4cd6e..32f78eb46744 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -11,10 +11,12 @@ #include #include #include +#include +#include + #include #include #include -#include static int backtrace_stack(void *data, char *name) { @@ -36,17 +38,53 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; +/* from arch/x86/kernel/cpu/perf_event.c: */ + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page); + memcpy(to, map+offset, size); + kunmap_atomic(map); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * dump_user_backtrace_32(struct stack_frame_ia32 *head) { + /* Also check accessibility of one struct frame_head beyond: */ struct stack_frame_ia32 bufhead[2]; struct stack_frame_ia32 *fp; + unsigned long bytes; - /* Also check accessibility of one struct frame_head beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != sizeof(bufhead)) return NULL; fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); @@ -87,12 +125,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) static struct stack_frame *dump_user_backtrace(struct stack_frame *head) { + /* Also check accessibility of one struct frame_head beyond: */ struct stack_frame bufhead[2]; + unsigned long bytes; - /* Also check accessibility of one struct stack_frame beyond */ - if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) - return NULL; - if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != sizeof(bufhead)) return NULL; oprofile_add_trace(bufhead[0].return_address); -- cgit From 1880c4ae182afb5650c5678949ecfe7ff66a724e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jun 2011 16:49:18 +0400 Subject: perf, x86: Add hw_watchdog_set_attr() in a sake of nmi-watchdog on P4 Due to restriction and specifics of Netburst PMU we need a separated event for NMI watchdog. In particular every Netburst event consumes not just a counter and a config register, but also an additional ESCR register. Since ESCR registers are grouped upon counters (i.e. if ESCR is occupied for some event there is no room for another event to enter until its released) we need to pick up the "least" used ESCR (or the most available one) for nmi-watchdog purposes -- so MSR_P4_CRU_ESCR2/3 was chosen. With this patch nmi-watchdog and perf top should be able to run simultaneously. Signed-off-by: Cyrill Gorcunov CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Tested-and-reviewed-by: Don Zickus Tested-and-reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110623124918.GC13050@sun Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ arch/x86/kernel/cpu/perf_event_p4.c | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3a0338b4b179..8a57f9aa8e36 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -233,6 +233,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -315,6 +316,12 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + if (x86_pmu.hw_watchdog_set_attr) + x86_pmu.hw_watchdog_set_attr(wd_attr); +} + /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ead584fb6a7d..f76fddf63381 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -705,6 +705,31 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } +static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) +{ + /* + * Watchdog ticks are special on Netburst, we use + * that named "non-sleeping" ticks as recommended + * by Intel SDM Vol3b. + */ + WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || + wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); + + wd_attr->type = PERF_TYPE_RAW; + wd_attr->config = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE); +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1179,6 +1204,7 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* -- cgit From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 14:41:57 +0200 Subject: perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jason Wessel Cc: Don Zickus Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++-- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/mm/fault.c | 6 +++--- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8a57f9aa8e36..5b86ec51534c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1339,7 +1339,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 41178c826c48..d38b0020f775 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,7 +1003,7 @@ again: data.period = event->hw.last_period; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index bab491b8ee25..0941f93f2940 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) + if (perf_output_begin(&handle, event, header.size * (top - at), 1)) return 1; for (; at < top; at++) { @@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; - if (perf_event_overflow(event, 1, &data, ®s)) + if (perf_event_overflow(event, &data, ®s)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index f76fddf63381..d6e6a67b9608 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -970,7 +970,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5f9ecff328b5..98da6a7b5e82 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -608,7 +608,7 @@ int kgdb_arch_init(void) return register_die_notifier(&kgdb_notifier); } -static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, +static void kgdb_hw_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 807c2a2b80f1..11db2e9b860a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, int nmi, +static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2dbf6bf4c7e5..4d09df054e39 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1161,11 +1161,11 @@ good_area: if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { -- cgit From a7ac67ea021b4603095d2aa458bc41641238f22c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 16:47:16 +0200 Subject: perf: Remove the perf_output_begin(.sample) argument Since only samples call perf_output_sample() its much saner (and more correct) to put the sample logic in there than in the perf_output_begin()/perf_output_end() pair. Saves a useless argument, reduces conditionals and shrinks struct perf_output_handle, win! Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-2crpvsx3cqu67q3zqjbnlpsc@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 0941f93f2940..1b1ef3addcfd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1)) + if (perf_output_begin(&handle, event, header.size * (top - at))) return 1; for (; at < top; at++) { -- cgit From efc9f05df2dd171280dcb736a4d973ffefd5508e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:03 +0200 Subject: perf_events: Update Intel extra regs shared constraints management This patch improves the code managing the extra shared registers used for offcore_response events on Intel Nehalem/Westmere. The idea is to use static allocation instead of dynamic allocation. This simplifies greatly the get and put constraint routines for those events. The patch also renames per_core to shared_regs because the same data structure gets used whether or not HT is on. When HT is off, those events still need to coordination because they use a extra MSR that has to be shared within an event group. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145703.GA7258@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 78 +++++++--- arch/x86/kernel/cpu/perf_event_intel.c | 260 ++++++++++++++++----------------- 2 files changed, 189 insertions(+), 149 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b86ec51534c..019fda7489e7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -44,6 +44,29 @@ do { \ } while (0) #endif +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + /* * best effort, GUP based copy_from_user() that assumes IRQ or NMI context */ @@ -132,11 +155,10 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* - * Intel percore register state. - * Coordinate shared resources between HT threads. + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB */ - int percore_used; /* Used by this CPU? */ - struct intel_percore *per_core; + struct intel_shared_regs *shared_regs; /* * AMD specific bits @@ -186,27 +208,46 @@ struct cpu_hw_events { #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->weight; (e)++) +/* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + /* * Extra registers for specific events. + * * Some events need large masks and require external MSRs. - * Define a mapping to these extra registers. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. */ struct extra_reg { unsigned int event; unsigned int msr; u64 config_mask; u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ }; -#define EVENT_EXTRA_REG(e, ms, m, vm) { \ +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ .event = (e), \ .msr = (ms), \ .config_mask = (m), \ .valid_mask = (vm), \ + .idx = EXTRA_REG_##i \ } -#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) union perf_capabilities { struct { @@ -253,7 +294,6 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - struct event_constraint *percore_constraints; void (*quirks)(void); int perfctr_second_write; @@ -400,10 +440,10 @@ static inline unsigned int x86_pmu_event_addr(int index) */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { + struct hw_perf_event_extra *reg; struct extra_reg *er; - event->hw.extra_reg = 0; - event->hw.extra_config = 0; + reg = &event->hw.extra_reg; if (!x86_pmu.extra_regs) return 0; @@ -413,8 +453,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; - event->hw.extra_reg = er->msr; - event->hw.extra_config = event->attr.config1; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; break; } return 0; @@ -713,6 +755,9 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } @@ -754,8 +799,8 @@ static void x86_pmu_disable(struct pmu *pmu) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { - if (hwc->extra_reg) - wrmsrl(hwc->extra_reg, hwc->extra_config); + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); wrmsrl(hwc->config_base, hwc->config | enable_mask); } @@ -1692,7 +1737,6 @@ static int validate_group(struct perf_event *event) fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); if (!fake_cpuc) goto out; - /* * the event is not yet connected with its * siblings therefore we must first collect diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d38b0020f775..6ad95baff856 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,25 +1,15 @@ #ifdef CONFIG_CPU_SUP_INTEL -#define MAX_EXTRA_REGS 2 - -/* - * Per register state. - */ -struct er_account { - int ref; /* reference count */ - unsigned int extra_reg; /* extra MSR number */ - u64 extra_config; /* extra MSR config */ -}; - /* - * Per core state - * This used to coordinate shared registers for HT threads. + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. */ -struct intel_percore { - raw_spinlock_t lock; /* protect structure */ - struct er_account regs[MAX_EXTRA_REGS]; - int refcnt; /* number of threads */ - unsigned core_id; +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ }; /* @@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -125,18 +109,11 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - INTEL_EVENT_CONSTRAINT(0xbb, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1037,65 +1014,89 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ static struct event_constraint * -intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) { - struct hw_perf_event *hwc = &event->hw; - unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; - struct event_constraint *c; - struct intel_percore *pc; + struct event_constraint *c = &emptyconstraint; struct er_account *era; - int i; - int free_slot; - int found; - if (!x86_pmu.percore_constraints || hwc->extra_alloc) - return NULL; + /* already allocated shared msr */ + if (reg->alloc || !cpuc->shared_regs) + return &unconstrained; - for (c = x86_pmu.percore_constraints; c->cmask; c++) { - if (e != c->code) - continue; + era = &cpuc->shared_regs->regs[reg->idx]; + + raw_spin_lock(&era->lock); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* no need to reallocate during incremental event scheduling */ + reg->alloc = 1; /* - * Allocate resource per core. + * All events using extra_reg are unconstrained. + * Avoids calling x86_get_event_constraints() + * + * Must revisit if extra_reg controlling events + * ever have constraints. Worst case we go through + * the regular event constraint table. */ - pc = cpuc->per_core; - if (!pc) - break; - c = &emptyconstraint; - raw_spin_lock(&pc->lock); - free_slot = -1; - found = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { - /* Allow sharing same config */ - if (hwc->extra_config == era->extra_config) { - era->ref++; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - /* else conflict */ - found = 1; - break; - } else if (era->ref == 0 && free_slot == -1) - free_slot = i; - } - if (!found && free_slot != -1) { - era = &pc->regs[free_slot]; - era->ref = 1; - era->extra_reg = hwc->extra_reg; - era->extra_config = hwc->extra_config; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - raw_spin_unlock(&pc->lock); - return c; + c = &unconstrained; } + raw_spin_unlock(&era->lock); - return NULL; + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * only put constraint if extra reg was actually + * allocated. Also takes care of event which do + * not use an extra shared reg + */ + if (!reg->alloc) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL; + struct hw_perf_event_extra *xreg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, xreg); + return c; } static struct event_constraint * @@ -1111,49 +1112,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - c = intel_percore_constraints(cpuc, event); + c = intel_shared_regs_constraints(cpuc, event); if (c) return c; return x86_get_event_constraints(cpuc, event); } -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct extra_reg *er; - struct intel_percore *pc; - struct er_account *era; - struct hw_perf_event *hwc = &event->hw; - int i, allref; - - if (!cpuc->percore_used) - return; + struct hw_perf_event_extra *reg; - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (hwc->config & er->config_mask)) - continue; + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} - pc = cpuc->per_core; - raw_spin_lock(&pc->lock); - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && - era->extra_config == hwc->extra_config && - era->extra_reg == er->msr) { - era->ref--; - hwc->extra_alloc = 0; - break; - } - } - allref = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) - allref += pc->regs[i].ref; - if (allref == 0) - cpuc->percore_used = 0; - raw_spin_unlock(&pc->lock); - break; - } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); } static int intel_pmu_hw_config(struct perf_event *event) @@ -1231,20 +1211,36 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, }; +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!cpu_has_ht_siblings()) + if (!x86_pmu.extra_regs) return NOTIFY_OK; - cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), - GFP_KERNEL, cpu_to_node(cpu)); - if (!cpuc->per_core) + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) return NOTIFY_BAD; - raw_spin_lock_init(&cpuc->per_core->lock); - cpuc->per_core->core_id = -1; return NOTIFY_OK; } @@ -1260,32 +1256,34 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpu_has_ht_siblings()) + if (!cpuc->shared_regs) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + struct intel_shared_regs *pc; + pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - kfree(cpuc->per_core); - cpuc->per_core = pc; + kfree(cpuc->shared_regs); + cpuc->shared_regs = pc; break; } } - cpuc->per_core->core_id = core_id; - cpuc->per_core->refcnt++; + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } static void intel_pmu_cpu_dying(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_percore *pc = cpuc->per_core; + struct intel_shared_regs *pc; + pc = cpuc->shared_regs; if (pc) { if (pc->core_id == -1 || --pc->refcnt == 0) kfree(pc); - cpuc->per_core = NULL; + cpuc->shared_regs = NULL; } fini_debug_store_on_cpu(cpu); @@ -1436,7 +1434,6 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; @@ -1481,7 +1478,6 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; -- cgit From cd8a38d33e2528998998bae70a45ad27e442f114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:08 +0200 Subject: perf_events: Fix validation of events using an extra reg The validate_group() function needs to validate events with extra shared regs. Within an event group, only events with the same value for the extra reg can co-exist. This was not checked by validate_group() because it was missing the shared_regs logic. This patch changes the allocation of the fake cpuc used for validation to also point to a fake shared_regs structure such that group events be properly testing. It modifies __intel_shared_reg_get_constraints() to use spin_lock_irqsave() to avoid lockdep issues. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145708.GA7279@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++++++++++++++++++-------- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++--- 2 files changed, 57 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 019fda7489e7..9a0f55c99b6e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1689,6 +1689,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu) perf_pmu_enable(pmu); return 0; } +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} /* * validate that we can schedule this event @@ -1699,9 +1733,9 @@ static int validate_event(struct perf_event *event) struct event_constraint *c; int ret = 0; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - return -ENOMEM; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, event); @@ -1711,7 +1745,7 @@ static int validate_event(struct perf_event *event) if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); - kfree(fake_cpuc); + free_fake_cpuc(fake_cpuc); return ret; } @@ -1731,35 +1765,32 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret, n; + int ret = -ENOSPC, n; - ret = -ENOMEM; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - goto out; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ - ret = -ENOSPC; n = collect_events(fake_cpuc, leader, true); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); -out_free: - kfree(fake_cpuc); out: + free_fake_cpuc(fake_cpuc); return ret; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6ad95baff856..ac02b83e8614 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,14 +1027,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, { struct event_constraint *c = &emptyconstraint; struct er_account *era; + unsigned long flags; /* already allocated shared msr */ - if (reg->alloc || !cpuc->shared_regs) + if (reg->alloc) return &unconstrained; era = &cpuc->shared_regs->regs[reg->idx]; - - raw_spin_lock(&era->lock); + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); if (!atomic_read(&era->ref) || era->config == reg->config) { @@ -1058,7 +1062,7 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, */ c = &unconstrained; } - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); return c; } @@ -1524,4 +1528,8 @@ static int intel_pmu_init(void) return 0; } +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} #endif /* CONFIG_CPU_SUP_INTEL */ -- cgit From ee89cbc2d48150c7c0e9f2aaac00afde99af098c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:12 +0200 Subject: perf_events: Add Intel Sandy Bridge offcore_response low-level support This patch adds Intel Sandy Bridge offcore_response support by providing the low-level constraint table for those events. On Sandy Bridge, there are two offcore_response events. Each uses its own dedictated extra register. But those registers are NOT shared between sibling CPUs when HT is on unlike Nehalem/Westmere. They are always private to each CPU. But they still need to be controlled within an event group. All events within an event group must use the same value for the extra MSR. That's not controlled by the second patch in this series. Furthermore on Sandy Bridge, the offcore_response events have NO counter constraints contrary to what the official documentation indicates, so drop the events from the contraint table. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145712.GA7304@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9a0f55c99b6e..583f3113436d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -327,6 +327,7 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; + bool regs_no_ht_sharing; }; static struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ac02b83e8614..a674ae45a472 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -100,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ - INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ EVENT_CONSTRAINT_END @@ -122,6 +120,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + EVENT_EXTRA_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -1260,7 +1264,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs) + if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1502,6 +1506,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.regs_no_ht_sharing = true; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit From b79e8941fb9af07d810da91b4e29da2bba331b6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2011 11:08:15 +0200 Subject: perf, intel: Try alternative OFFCORE encodings Since the OFFCORE registers are fully symmetric, try the other one when the specified one is already in use. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306141897.18455.8.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 5 +++- arch/x86/kernel/cpu/perf_event_intel.c | 44 ++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 583f3113436d..c53d433c3dde 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -327,9 +327,12 @@ struct x86_pmu { * Extra registers for events */ struct extra_reg *extra_regs; - bool regs_no_ht_sharing; + unsigned int er_flags; }; +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a674ae45a472..5c448622468c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1018,6 +1018,29 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return false; + + if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.idx = EXTRA_REG_RSP_1; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01b7; + event->hw.extra_reg.idx = EXTRA_REG_RSP_0; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } + + if (event->hw.extra_reg.idx == orig_idx) + return false; + + return true; +} + /* * manage allocation of shared extra msr for certain events * @@ -1027,16 +1050,19 @@ intel_bts_constraints(struct perf_event *event) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) + struct perf_event *event) { struct event_constraint *c = &emptyconstraint; + struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; + int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) return &unconstrained; +again: era = &cpuc->shared_regs->regs[reg->idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when @@ -1065,6 +1091,9 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, * the regular event constraint table. */ c = &unconstrained; + } else if (intel_try_alt_er(event, orig_idx)) { + raw_spin_unlock(&era->lock); + goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1099,11 +1128,10 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c = NULL; - struct hw_perf_event_extra *xreg; - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, xreg); + if (event->hw.extra_reg.idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, event); + return c; } @@ -1264,7 +1292,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) + if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1489,6 +1517,7 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; @@ -1508,7 +1537,8 @@ static __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_snb_pebs_events; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.regs_no_ht_sharing = true; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit From 89d6c0b5bdbb1927775584dcf532d98b3efe1477 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Apr 2011 23:37:06 +0200 Subject: perf, arch: Add generic NODE cache events Add a NODE level to the generic cache events which is used to measure local vs remote memory accesses. Like all other cache events, an ACCESS is HIT+MISS, if there is no way to distinguish between reads and writes do reads only etc.. The below needs filling out for !x86 (which I filled out with unsupported events). I'm fairly sure ARM can leave it like that since it doesn't strike me as an architecture that even has NUMA support. SH might have something since it does appear to have some NUMA bits. Sparc64, PowerPC and MIPS certainly want a good look there since they clearly are NUMA capable. Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Anton Blanchard Cc: David Daney Cc: Deng-Cheng Zhu Cc: Paul Mundt Cc: Will Deacon Cc: Robert Richter Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1303508226.4865.8.camel@laptop Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 14 ++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 59 +++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event_p4.c | 14 ++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index fe29c1d2219e..941caa2e449b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; /* diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5c448622468c..bf6f92f7c19d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -226,6 +226,21 @@ static __initconst const u64 snb_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + }; static __initconst const u64 westmere_hw_cache_event_ids @@ -327,6 +342,20 @@ static __initconst const u64 westmere_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; /* @@ -379,7 +408,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, }, - } + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, + }, + }, }; static __initconst const u64 nehalem_hw_cache_event_ids @@ -481,6 +524,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; static __initconst const u64 core2_hw_cache_event_ids diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index d6e6a67b9608..fb901c5080f7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -554,6 +554,20 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p4_general_events[PERF_COUNT_HW_MAX] = { -- cgit From 4dc0da86967d5463708631d02a70cfed5b104884 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:35 +0300 Subject: perf: Add context field to perf_event The perf_event overflow handler does not receive any caller-derived argument, so many callers need to resort to looking up the perf_event in their local data structure. This is ugly and doesn't scale if a single callback services many perf_events. Fix by adding a context parameter to perf_event_create_kernel_counter() (and derived hardware breakpoints APIs) and storing it in the perf_event. The field can be accessed from the callback as event->overflow_handler_context. All callers are updated. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-2-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 2 +- arch/x86/kernel/ptrace.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 98da6a7b5e82..00354d4919a9 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -638,7 +638,7 @@ void kgdb_arch_late(void) for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; - breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 11db2e9b860a..82528799c5de 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); /* * CHECKME: the previous code returned -EIO if the addr wasn't -- cgit From 0af3ac1fdb9d5c297b4b07c9e0172531d42b6716 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:36 +0300 Subject: x86, perf: Add constraints for architectural PMU The v1 PMU does not have any fixed counters. Using the v2 constraints, which do have fixed counters, causes an additional choice to be present in the weight calculation, but not when actually scheduling the event, leading to an event being not scheduled at all. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-3-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index bf6f92f7c19d..45fbb8f7f549 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -112,6 +112,11 @@ static struct extra_reg intel_westmere_extra_regs[] __read_mostly = EVENT_EXTRA_END }; +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1606,11 +1611,19 @@ static __init int intel_pmu_init(void) break; default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } } return 0; } -- cgit From 9e46294dadedc0c04adcb8ce760bd2cd74f7332d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:00:52 +0200 Subject: x86: Save stack pointer in perf live regs savings In order to prepare for fetching the stack pointer from the regs when possible in dump_trace() instead of taking the local one, save the current stack pointer in perf live regs saving. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d9d4dae305f6..094fb30817ab 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); (regs)->bp = caller_frame_pointer(); \ (regs)->cs = __KERNEL_CS; \ regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ } #else -- cgit From 47ce11a2b6519f9c7843223ea8e561eb71ea5896 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 Jun 2011 19:04:56 +0200 Subject: x86: Fetch stack from regs when possible in dump_trace() When regs are passed to dump_stack(), we fetch the frame pointer from the regs but the stack pointer is taken from the current frame. Thus the frame and stack pointers may not come from the same context. For example this can result in the unwinder to think the context is in irq, due to the current value of the stack, but the frame pointer coming from the regs points to a frame from another place. It then tries to fix up the irq link but ends up dereferencing a random frame pointer that doesn't belong to the irq stack: [ 9131.706906] ------------[ cut here ]------------ [ 9131.707003] WARNING: at arch/x86/kernel/dumpstack_64.c:129 dump_trace+0x2aa/0x330() [ 9131.707003] Hardware name: AMD690VM-FMH [ 9131.707003] Perf: bad frame pointer = 0000000000000005 in callchain [ 9131.707003] Modules linked in: [ 9131.707003] Pid: 1050, comm: perf Not tainted 3.0.0-rc3+ #181 [ 9131.707003] Call Trace: [ 9131.707003] [] warn_slowpath_common+0x7a/0xb0 [ 9131.707003] [] warn_slowpath_fmt+0x41/0x50 [ 9131.707003] [] ? bad_to_user+0x6d/0x10be [ 9131.707003] [] dump_trace+0x2aa/0x330 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] perf_callchain_kernel+0x54/0x70 [ 9131.707003] [] perf_prepare_sample+0x19f/0x2a0 [ 9131.707003] [] __perf_event_overflow+0x16c/0x290 [ 9131.707003] [] ? __perf_event_overflow+0x130/0x290 [ 9131.707003] [] ? native_sched_clock+0x13/0x50 [ 9131.707003] [] ? sched_clock+0x9/0x10 [ 9131.707003] [] ? T.375+0x15/0x90 [ 9131.707003] [] ? trace_hardirqs_on_caller+0x64/0x180 [ 9131.707003] [] ? trace_hardirqs_off+0xd/0x10 [ 9131.707003] [] perf_event_overflow+0x14/0x20 [ 9131.707003] [] perf_swevent_hrtimer+0x11c/0x130 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] __run_hrtimer+0x83/0x1e0 [ 9131.707003] [] ? perf_event_overflow+0x20/0x20 [ 9131.707003] [] hrtimer_interrupt+0x106/0x250 [ 9131.707003] [] ? trace_hardirqs_off_thunk+0x3a/0x3c [ 9131.707003] [] smp_apic_timer_interrupt+0x53/0x90 [ 9131.707003] [] apic_timer_interrupt+0x13/0x20 [ 9131.707003] [] ? error_exit+0x51/0xb0 [ 9131.707003] [] ? error_exit+0x4c/0xb0 [ 9131.707003] ---[ end trace b2560d4876709347 ]--- Fix this by simply taking the stack pointer from regs->sp when regs are provided. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/dumpstack_64.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index e71c98d3c0d2..788295cbe4a7 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -155,9 +155,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, task = current; if (!stack) { - stack = &dummy; - if (task && task != current) + if (regs) + stack = (unsigned long *)regs->sp; + else if (task && task != current) stack = (unsigned long *)task->thread.sp; + else + stack = &dummy; } if (!bp) -- cgit From 1871853f7abc3c727c4346539c5062cbeaf016a4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 01:51:22 +0200 Subject: x86,64: Simplify save_regs() The save_regs function that saves the regs on low level irq entry is complicated because of the fact it changes its stack in the middle and also because it manipulates data allocated in the caller frame and accesses there are directly calculated from callee rsp value with the return address in the middle of the way. This complicates the static stack offsets calculation and require more dynamic ones. It also needs a save/restore of the function's return address. To simplify and optimize this, turn save_regs() into a macro. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0c989e..b6b2e85454cf 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -297,27 +297,22 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ - .pushsection .kprobes.text, "ax" -ENTRY(save_args) - XCPT_FRAME + .macro SAVE_ARGS_IRQ cld - /* - * start from rbp in pt_regs and jump over - * return address. - */ - movq_cfi rdi, RDI+8-RBP - movq_cfi rsi, RSI+8-RBP - movq_cfi rdx, RDX+8-RBP - movq_cfi rcx, RCX+8-RBP - movq_cfi rax, RAX+8-RBP - movq_cfi r8, R8+8-RBP - movq_cfi r9, R9+8-RBP - movq_cfi r10, R10+8-RBP - movq_cfi r11, R11+8-RBP - - leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ - movq_cfi rbp, 8 /* push %rbp */ - leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, RDI-RBP + movq_cfi rsi, RSI-RBP + movq_cfi rdx, RDX-RBP + movq_cfi rcx, RCX-RBP + movq_cfi rax, RAX-RBP + movq_cfi r8, R8-RBP + movq_cfi r9, R9-RBP + movq_cfi r10, R10-RBP + movq_cfi r11, R11-RBP + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 0 /* push %rbp */ + movq %rsp, %rbp testl $3, CS(%rdi) je 1f SWAPGS @@ -329,19 +324,14 @@ ENTRY(save_args) */ 1: incl PER_CPU_VAR(irq_count) jne 2f - popq_cfi %rax /* move return address... */ mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 pushq_cfi %rbp /* backlink for unwinder */ - pushq_cfi %rax /* ... to the new stack */ /* * We entered an interrupt context - irqs are off: */ 2: TRACE_IRQS_OFF - ret - CFI_ENDPROC -END(save_args) - .popsection + .endm ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 @@ -791,7 +781,7 @@ END(interrupt) /* reserve pt_regs for scratch regs and rbp */ subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP - call save_args + SAVE_ARGS_IRQ PARTIAL_FRAME 0 call \func .endm -- cgit From 3b99a3ef55b292180473a221f3d6bc24455f0632 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 1 Jul 2011 02:25:17 +0200 Subject: x86,64: Separate arg1 from rbp handling in SAVE_REGS_IRQ Just for clarity in the code. Have a first block that handles the frame pointer and a separate one that handles pt_regs pointer and its use. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b6b2e85454cf..20dc8e6b6d80 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,9 +310,10 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - leaq -RBP(%rsp),%rdi /* arg1 for handler */ movq_cfi rbp, 0 /* push %rbp */ movq %rsp, %rbp + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) je 1f SWAPGS -- cgit From 48ffee7d9e6df51b4957bed64115b7beed671374 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 15:03:44 +0200 Subject: x86: Remove useless unwinder backlink from irq regs saving The unwinder backlink in interrupt entry is very useless. It's actually not part of the stack frame chain and thus is never used. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/entry_64.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 20dc8e6b6d80..6131432c5afa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -327,7 +327,6 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - pushq_cfi %rbp /* backlink for unwinder */ /* * We entered an interrupt context - irqs are off: */ -- cgit From a2bbe75089d5eb9a3a46d50dd5c215e213790288 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 2 Jul 2011 16:52:45 +0200 Subject: x86: Don't use frame pointer to save old stack on irq entry rbp is used in SAVE_ARGS_IRQ to save the old stack pointer in order to restore it later in ret_from_intr. It is convenient because we save its value in the irq regs and it's easily restored using the leave instruction. However this is a kind of abuse of the frame pointer which role is to help unwinding the kernel by chaining frames together, each node following the return address to the previous frame. But although we are breaking the frame by changing the stack pointer, there is no preceding return address before the new frame. Hence using the frame pointer to link the two stacks breaks the stack unwinders that find a random value instead of a return address here. There is no workaround that can work in every case. We are using the fixup_bp_irq_link() function to dereference that abused frame pointer in the case of non nesting interrupt (which means stack changed). But that doesn't fix the case of interrupts that don't change the stack (but we still have the unconditional frame link), which is the case of hardirq interrupting softirq. We have no way to detect this transition so the frame irq link is considered as a real frame pointer and the return address is dereferenced but it is still a spurious one. There are two possible results of this: either the spurious return address, a random stack value, luckily belongs to the kernel text and then the unwinding can continue and we just have a weird entry in the stack trace. Or it doesn't belong to the kernel text and unwinding stops there. This is the reason why stacktraces (including perf callchains) on irqs that interrupted softirqs don't work very well. To solve this, we don't save the old stack pointer on rbp anymore but we save it to a scratch register that we push on the new stack and that we pop back later on irq return. This preserves the whole frame chain without spurious return addresses in the middle and drops the need for the horrid fixup_bp_irq_link() workaround. And finally irqs that interrupt softirq are sanely unwinded. Before: 99.81% perf [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--41.60%-- __read | | | |--99.90%-- create_worker | | bench_sched_messaging | | cmd_bench | | run_builtin | | main | | __libc_start_main | --0.10%-- [...] After: 1.64% swapper [kernel.kallsyms] [k] perf_pending_event | --- perf_pending_event irq_work_run smp_irq_work_interrupt irq_work_interrupt | |--95.00%-- arch_irq_work_raise | irq_work_queue | __perf_event_overflow | perf_swevent_overflow | perf_swevent_event | perf_tp_event | perf_trace_softirq | __do_softirq | call_softirq | do_softirq | irq_exit | | | |--73.68%-- smp_apic_timer_interrupt | | apic_timer_interrupt | | | | | |--96.43%-- amd_e400_idle | | | cpu_idle | | | start_secondary Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jan Beulich --- arch/x86/kernel/dumpstack_64.c | 30 ------------------------------ arch/x86/kernel/entry_64.S | 27 +++++++++++++++------------ 2 files changed, 15 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 788295cbe4a7..19853ad8afc5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -104,34 +104,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, return (stack >= irq_stack && stack < irq_stack_end); } -/* - * We are returning from the irq stack and go to the previous one. - * If the previous stack is also in the irq stack, then bp in the first - * frame of the irq stack points to the previous, interrupted one. - * Otherwise we have another level of indirection: We first save - * the bp of the previous stack, then we switch the stack to the irq one - * and save a new bp that links to the previous one. - * (See save_args()) - */ -static inline unsigned long -fixup_bp_irq_link(unsigned long bp, unsigned long *stack, - unsigned long *irq_stack, unsigned long *irq_stack_end) -{ -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long next; - - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { - if (!probe_kernel_address(&frame->next_frame, next)) - return next; - else - WARN_ONCE(1, "Perf: bad frame pointer = %p in " - "callchain\n", &frame->next_frame); - } -#endif - return bp; -} - /* * x86-64 can have up to three kernel stacks: * process stack @@ -208,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); - bp = fixup_bp_irq_link(bp, stack, irq_stack, - irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6131432c5afa..d656f68371a4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -310,8 +310,11 @@ ENDPROC(native_usergs_sysret64) movq_cfi r10, R10-RBP movq_cfi r11, R11-RBP - movq_cfi rbp, 0 /* push %rbp */ - movq %rsp, %rbp + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ testl $3, CS(%rdi) @@ -327,10 +330,11 @@ ENDPROC(native_usergs_sysret64) jne 2f mov PER_CPU_VAR(irq_stack_ptr),%rsp EMPTY_FRAME 0 - /* - * We entered an interrupt context - irqs are off: - */ -2: TRACE_IRQS_OFF + +2: /* Store previous stack value */ + pushq %rsi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF .endm ENTRY(save_rest) @@ -804,15 +808,14 @@ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) - leaveq - CFI_RESTORE rbp + /* Restore saved previous stack */ + popq %rsi + leaq 16(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 + CFI_ADJUST_CFA_OFFSET -16 - /* we did not save rbx, restore only from ARGOFFSET */ - addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) -- cgit From e08fbb78f03fe2c4f88824faf6f51ce6af185e11 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Jul 2011 23:04:36 -0400 Subject: tracing, x86/irq: Do not trace arch_local_{*,irq_*}() functions I triggered a triple fault with gcc 4.5.1 because it did not honor the inline annotation to arch_local_save_flags() function and that function was added to the pool of functions traced by the function tracer. When preempt_schedule() called arch_local_save_flags() (called by irqs_disabled()), it was traced, but the first thing the function tracer does is disable preemption. When it enables preemption, the NEED_RESCHED flag will not have been cleared and the preemption check will trigger the call to preempt_schedule() again. Although the dynamic function tracer crashed immediately, the static version of the function tracer (CONFIG_DYNAMIC_FTRACE is not set) actually was able to show where the problem was. swapper-1 3.N.. 103885us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103886us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103886us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103887us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103887us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103888us : arch_local_save_flags <-preempt_schedule swapper-1 3.N.. 103888us : arch_local_save_flags <-preempt_schedule It went on for a while before it triple faulted with a corrupted stack. The arch_local_save_flags and arch_local_irq_* functions should not be traced. Even though they are marked as inline, gcc may still make them a function and enable tracing of them. The simple solution is to just mark them as notrace. I had to add the for this file to include the notrace tag. Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/20110702033852.733414762@goodmis.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irqflags.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 5745ce8bf108..bba3cf88e624 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -60,23 +60,24 @@ static inline void native_halt(void) #include #else #ifndef __ASSEMBLY__ +#include -static inline unsigned long arch_local_save_flags(void) +static inline notrace unsigned long arch_local_save_flags(void) { return native_save_fl(); } -static inline void arch_local_irq_restore(unsigned long flags) +static inline notrace void arch_local_irq_restore(unsigned long flags) { native_restore_fl(flags); } -static inline void arch_local_irq_disable(void) +static inline notrace void arch_local_irq_disable(void) { native_irq_disable(); } -static inline void arch_local_irq_enable(void) +static inline notrace void arch_local_irq_enable(void) { native_irq_enable(); } @@ -102,7 +103,7 @@ static inline void halt(void) /* * For spinlocks, etc: */ -static inline unsigned long arch_local_irq_save(void) +static inline notrace unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); -- cgit From f91298709790b9a483752ca3c967845537df2af3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 9 Jul 2011 00:17:12 +0400 Subject: perf, x86: P4 PMU - Introduce event alias feature Instead of hw_nmi_watchdog_set_attr() weak function and appropriate x86_pmu::hw_watchdog_set_attr() call we introduce even alias mechanism which allow us to drop this routines completely and isolate quirks of Netburst architecture inside P4 PMU code only. The main idea remains the same though -- to allow nmi-watchdog and perf top run simultaneously. Note the aliasing mechanism applies to generic PERF_COUNT_HW_CPU_CYCLES event only because arbitrary event (say passed as RAW initially) might have some additional bits set inside ESCR register changing the behaviour of event and we can't guarantee anymore that alias event will give the same result. P.S. Thanks a huge to Don and Steven for for testing and early review. Acked-by: Don Zickus Tested-by: Steven Rostedt Signed-off-by: Cyrill Gorcunov CC: Ingo Molnar CC: Peter Zijlstra CC: Stephane Eranian CC: Lin Ming CC: Arnaldo Carvalho de Melo CC: Frederic Weisbecker Link: http://lkml.kernel.org/r/20110708201712.GS23657@sun Signed-off-by: Steven Rostedt --- arch/x86/include/asm/perf_event_p4.h | 33 +++++++++ arch/x86/kernel/cpu/perf_event.c | 7 -- arch/x86/kernel/cpu/perf_event_p4.c | 135 +++++++++++++++++++++++++++-------- 3 files changed, 139 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 56fd9e3abbda..4d86c86178f2 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -101,6 +101,14 @@ #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) +/* + * If an event has alias it should be marked + * with a special bit. (Don't forget to check + * P4_PEBS_CONFIG_MASK and related bits on + * modification.) + */ +#define P4_CONFIG_ALIASABLE (1 << 9) + /* * The bits we allow to pass for RAW events */ @@ -123,6 +131,31 @@ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) +/* + * In case of event aliasing we need to preserve some + * caller bits otherwise the mapping won't be complete. + */ +#define P4_CONFIG_EVENT_ALIAS_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ + p4_config_pack_cccr(P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE)) + +#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \ + ((P4_CONFIG_HT) | \ + p4_config_pack_escr(P4_ESCR_T0_OS | \ + P4_ESCR_T0_USR | \ + P4_ESCR_T1_OS | \ + P4_ESCR_T1_USR) | \ + p4_config_pack_cccr(P4_CCCR_OVF | \ + P4_CCCR_CASCADE | \ + P4_CCCR_FORCE_OVF | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_OVF_PMI_T0 | \ + P4_CCCR_OVF_PMI_T1 | \ + P4_CONFIG_ALIASABLE)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c53d433c3dde..b7a010fce8c3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -274,7 +274,6 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); - void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -360,12 +359,6 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - if (x86_pmu.hw_watchdog_set_attr) - x86_pmu.hw_watchdog_set_attr(wd_attr); -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index fb901c5080f7..8b7a0c306784 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -570,11 +570,92 @@ static __initconst const u64 p4_hw_cache_event_ids }, }; +/* + * Because of Netburst being quite restricted in now + * many same events can run simultaneously, we use + * event aliases, ie different events which have the + * same functionallity but use non-intersected resources + * (ESCR/CCCR/couter registers). This allow us to run + * two or more semi-same events together. It is done + * transparently to a user space. + * + * Never set any cusom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up-to-dated automatically either not appliable + * at all. + * + * And be really carefull choosing aliases! + */ +struct p4_event_alias { + u64 orig; + u64 alter; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with + * non-sleeping cycles (see Intel SDM Vol3b for + * details). + */ + .orig = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alter = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Probably we're lucky and don't have to do + * matching over all config bits. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + /* + * If an event was previously swapped to the alter config + * we should swap it back otherwise contnention on registers + * will return back. + */ + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].orig) { + config_match = p4_event_aliases[i].alter; + break; + } else if (config_match == p4_event_aliases[i].alter) { + config_match = p4_event_aliases[i].orig; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | + (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + static u64 p4_general_events[PERF_COUNT_HW_MAX] = { /* non-halted CPU clocks */ [PERF_COUNT_HW_CPU_CYCLES] = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, /* * retired instructions @@ -719,31 +800,6 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } -static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - /* - * Watchdog ticks are special on Netburst, we use - * that named "non-sleeping" ticks as recommended - * by Intel SDM Vol3b. - */ - WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || - wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); - - wd_attr->type = PERF_TYPE_RAW; - wd_attr->config = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE); -} - static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1159,6 +1215,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign struct p4_event_bind *bind; unsigned int i, thread, num; int cntr_idx, escr_idx; + u64 config_alias; + int pass; bitmap_zero(used_mask, X86_PMC_IDX_MAX); bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); @@ -1167,6 +1225,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign hwc = &cpuc->event_list[i]->hw; thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * Aliases are swappable so we may hit circular + * lock if both original config and alias need + * resources (MSR registers) which already busy. + */ + if (pass > 2) + goto done; + bind = p4_config_get_bind(hwc->config); escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); if (unlikely(escr_idx == -1)) @@ -1180,8 +1249,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign } cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) - goto done; + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Probably an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } p4_pmu_swap_config_ts(hwc, cpu); if (assign) @@ -1218,7 +1296,6 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* -- cgit From f53173e47dee5f7514d264796bec58d43ed0f67f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Jul 2011 20:06:25 +0400 Subject: x86, perf: P4 PMU - Fix typos in comments and style cleanup This patch: - fixes typos in comments and clarifies the text - renames obscure p4_event_alias::original and ::alter members to ::original and ::alternative as appropriate - drops parenthesis from the return of p4_get_alias_event() No functional changes. Reported-by: Ingo Molnar Signed-off-by: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20110721160625.GX7492@sun Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event_p4.h | 2 +- arch/x86/kernel/cpu/perf_event_p4.c | 66 ++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 4d86c86178f2..4f7e67e2345e 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -133,7 +133,7 @@ /* * In case of event aliasing we need to preserve some - * caller bits otherwise the mapping won't be complete. + * caller bits, otherwise the mapping won't be complete. */ #define P4_CONFIG_EVENT_ALIAS_MASK \ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 8b7a0c306784..7809d2bcb209 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -571,35 +571,34 @@ static __initconst const u64 p4_hw_cache_event_ids }; /* - * Because of Netburst being quite restricted in now - * many same events can run simultaneously, we use - * event aliases, ie different events which have the - * same functionallity but use non-intersected resources - * (ESCR/CCCR/couter registers). This allow us to run - * two or more semi-same events together. It is done - * transparently to a user space. + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). * - * Never set any cusom internal bits such as P4_CONFIG_HT, - * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are - * either up-to-dated automatically either not appliable - * at all. + * This allow us to relax restrictions a bit and run two or more + * identical events together. * - * And be really carefull choosing aliases! + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. */ struct p4_event_alias { - u64 orig; - u64 alter; + u64 original; + u64 alternative; } p4_event_aliases[] = { { /* - * Non-halted cycles can be substituted with - * non-sleeping cycles (see Intel SDM Vol3b for - * details). + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. */ - .orig = + .original = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), - .alter = + .alternative = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| @@ -620,25 +619,21 @@ static u64 p4_get_alias_event(u64 config) int i; /* - * Probably we're lucky and don't have to do - * matching over all config bits. + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. */ if (!(config & P4_CONFIG_ALIASABLE)) return 0; config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; - /* - * If an event was previously swapped to the alter config - * we should swap it back otherwise contnention on registers - * will return back. - */ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { - if (config_match == p4_event_aliases[i].orig) { - config_match = p4_event_aliases[i].alter; + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; break; - } else if (config_match == p4_event_aliases[i].alter) { - config_match = p4_event_aliases[i].orig; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; break; } } @@ -646,8 +641,7 @@ static u64 p4_get_alias_event(u64 config) if (i >= ARRAY_SIZE(p4_event_aliases)) return 0; - return config_match | - (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); } static u64 p4_general_events[PERF_COUNT_HW_MAX] = { @@ -1229,9 +1223,9 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign again: /* - * Aliases are swappable so we may hit circular - * lock if both original config and alias need - * resources (MSR registers) which already busy. + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. */ if (pass > 2) goto done; @@ -1251,7 +1245,7 @@ again: cntr_idx = p4_next_cntr(thread, used_mask, bind); if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { /* - * Probably an event alias is still available. + * Check whether an event alias is still available. */ config_alias = p4_get_alias_event(hwc->config); if (!config_alias) -- cgit From 1ac2e6ca44e13a087eb7438d284f887097ee7e84 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jun 2011 11:49:55 +0200 Subject: x86, perf: Make copy_from_user_nmi() a library function copy_from_user_nmi() is used in oprofile and perf. Moving it to other library functions like copy_from_user(). As this is x86 code for 32 and 64 bits, create a new file usercopy.c for unified code. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110607172413.GJ20052@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 3 +++ arch/x86/kernel/cpu/perf_event.c | 35 -------------------------------- arch/x86/lib/Makefile | 2 +- arch/x86/lib/usercopy.c | 43 ++++++++++++++++++++++++++++++++++++++++ arch/x86/oprofile/backtrace.c | 39 +----------------------------------- 5 files changed, 48 insertions(+), 74 deletions(-) create mode 100644 arch/x86/lib/usercopy.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 99ddd148a760..36361bf6fdd1 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -555,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; }; #endif /* CONFIG_X86_WP_WORKS_OK */ +extern unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b7a010fce8c3..4ee3abf20ed6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -67,40 +66,6 @@ enum extra_reg_type { EXTRA_REG_MAX /* number of entries needed */ }; -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2479f19ddde..6ba477342b8e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o -lib-y += usercopy_$(BITS).o getuser.o putuser.o +lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c new file mode 100644 index 000000000000..97be9cb54483 --- /dev/null +++ b/arch/x86/lib/usercopy.c @@ -0,0 +1,43 @@ +/* + * User address space access functions. + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include + +/* + * best effort, GUP based copy_from_user() that is NMI-safe + */ +unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page); + memcpy(to, map+offset, size); + kunmap_atomic(map); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} +EXPORT_SYMBOL_GPL(copy_from_user_nmi); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 32f78eb46744..bff89dfe3619 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -12,10 +12,9 @@ #include #include #include -#include +#include #include -#include #include static int backtrace_stack(void *data, char *name) @@ -38,42 +37,6 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; -/* from arch/x86/kernel/cpu/perf_event.c: */ - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * dump_user_backtrace_32(struct stack_frame_ia32 *head) -- cgit