From c5cb5a2d8d7dc872cf1504091ad0e59fe5ff7cb5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 30 Jun 2009 17:08:14 -0400 Subject: kprobes: Clean up insn_pages by using list instead of hlist Use struct list instead of struct hlist for managing insn_pages, because insn_pages doesn't use hash table. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli LKML-Reference: <20090630210814.17851.64651.stgit@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516a..6fe9dc6d1a81 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) struct kprobe_insn_page { - struct hlist_node hlist; + struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; @@ -117,7 +117,7 @@ enum kprobe_slot_state { }; static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static struct hlist_head kprobe_insn_pages; +static LIST_HEAD(kprobe_insn_pages); static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -152,10 +152,9 @@ loop_end: static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; - struct hlist_node *pos; retry: - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) kfree(kip); return NULL; } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); + INIT_LIST_HEAD(&kip->list); + list_add(&kip->list, &kprobe_insn_pages); memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); kip->slot_used[0] = SLOT_USED; kip->nused = 1; @@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { + if (!list_is_singular(&kprobe_insn_pages)) { + list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); } @@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) static int __kprobes collect_garbage_slots(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos, *next; + struct kprobe_insn_page *kip, *next; /* Ensure no-one is preepmted on the garbages */ if (check_safety()) return -EAGAIN; - hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { + list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { int i; if (kip->ngarbage == 0) continue; @@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void) void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; - struct hlist_node *pos; mutex_lock(&kprobe_insn_mutex); - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; if (dirty) { kip->slot_used[i] = SLOT_DIRTY; kip->ngarbage++; - } else { + } else collect_one_slot(kip, i); - } break; } } -- cgit From 020e5f85cb087a40572c8b8b2dd06292a14fa212 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Jul 2009 10:47:05 +0800 Subject: tracing/events: Add trace_event boot option We already have ftrace= boot option, and this adds a similar boot option for trace events, so allow trace events to be enabled at boot, for boot debugging purpose. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4ACE29.3010407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 37 +++++++++++++++++++++++++++++++++---- 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa8..bdb3afc8b306 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,7 +49,7 @@ unsigned long __read_mostly tracing_thresh; * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -static int ring_buffer_expanded; +int ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -63,7 +63,7 @@ static bool __read_mostly tracing_selftest_running; /* * If a tracer is running, we do not want to run SELFTEST. */ -static bool __read_mostly tracing_selftest_disabled; +bool __read_mostly tracing_selftest_disabled; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc780..52eb0d8dcd75 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -517,6 +517,9 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +extern int ring_buffer_expanded; +extern bool tracing_selftest_disabled; + #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a88..fecac1314cbe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -17,6 +17,8 @@ #include #include +#include + #include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" @@ -1133,6 +1135,18 @@ struct notifier_block trace_module_nb = { extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = 1; + tracing_selftest_disabled = 1; + + return 1; +} +__setup("trace_event=", setup_trace_event); + static __init int event_trace_init(void) { struct ftrace_event_call *call; @@ -1140,6 +1154,8 @@ static __init int event_trace_init(void) struct dentry *entry; struct dentry *d_events; int ret; + char *buf = bootup_event_buf; + char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1185,6 +1201,19 @@ static __init int event_trace_init(void) &ftrace_event_format_fops); } + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warning("Failed to enable trace event: %s\n", token); + } + ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); @@ -1392,10 +1421,10 @@ static __init void event_trace_self_test_with_function(void) static __init int event_trace_self_tests_init(void) { - - event_trace_self_tests(); - - event_trace_self_test_with_function(); + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } return 0; } -- cgit From ddc1637af217dbd8bc51f30e6d24e84476a869a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 3 Jul 2009 17:34:24 +0800 Subject: kmemtrace: Print binary output only if 'bin' option is set Currently by default the output of kmemtrace is binary format instead of human-readable output. This patch makes the following changes: - We'll see human-readable output by default - We'll see binary output if 'bin' option is set Note: you may probably need to explicitly disable context-info binary output: # echo 0 > options/context-info # echo 1 > options/bin # cat trace_pipe v2: - use %pF to print call_site Signed-off-by: Li Zefan Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4DD0A0.5060500@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 120 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 1edaa9516e81..74903b62bcb6 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -239,12 +239,52 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, + (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, + (unsigned long)entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { - struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", + entry->type_id, (void *)entry->call_site, + (unsigned long)entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; struct kmemtrace_user_event *ev; + struct kmemtrace_user_event_alloc *ev_alloc; + + trace_assign_type(entry, iter->ent); ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) @@ -271,12 +311,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user_bin(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; struct kmemtrace_user_event *ev; + trace_assign_type(entry, iter->ent); + ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) return TRACE_TYPE_PARTIAL_LINE; @@ -294,12 +336,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter, /* The two other following provide a more minimalistic output */ static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_compress(struct trace_iterator *iter) { + struct kmemtrace_alloc_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Alloc entry */ ret = trace_seq_printf(s, " + "); if (!ret) @@ -362,12 +406,14 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, } static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_compress(struct trace_iterator *iter) { + struct kmemtrace_free_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Free entry */ ret = trace_seq_printf(s, " - "); if (!ret) @@ -421,32 +467,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - switch (entry->type) { - case TRACE_KMEM_ALLOC: { - struct kmemtrace_alloc_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_alloc_compress(iter, field); - else - return kmemtrace_print_alloc_user(iter, field); - } - - case TRACE_KMEM_FREE: { - struct kmemtrace_free_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_free_compress(iter, field); - else - return kmemtrace_print_free_user(iter, field); - } + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return TRACE_TYPE_UNHANDLED; + switch (entry->type) { + case TRACE_KMEM_ALLOC: + return kmemtrace_print_alloc_compress(iter); + case TRACE_KMEM_FREE: + return kmemtrace_print_free_compress(iter); default: return TRACE_TYPE_UNHANDLED; } } +static struct trace_event kmem_trace_alloc = { + .type = TRACE_KMEM_ALLOC, + .trace = kmemtrace_print_alloc_user, + .binary = kmemtrace_print_alloc_user_bin, +}; + +static struct trace_event kmem_trace_free = { + .type = TRACE_KMEM_FREE, + .trace = kmemtrace_print_free_user, + .binary = kmemtrace_print_free_user_bin, +}; + static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, @@ -463,6 +508,21 @@ void kmemtrace_init(void) static int __init init_kmem_tracer(void) { - return register_tracer(&kmem_tracer); + if (!register_ftrace_event(&kmem_trace_alloc)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_ftrace_event(&kmem_trace_free)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_tracer(&kmem_tracer)) { + pr_warning("Warning: could not register the kmem tracer\n"); + return 1; + } + + return 0; } device_initcall(init_kmem_tracer); -- cgit From 3adc54fa82a68be1cd1ac82ad786ee362796e50a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Mar 2009 15:32:01 -0400 Subject: ring-buffer: make the buffer a true circular link list This patch changes the ring buffer data pages from using a link list head pointer, to making each buffer page point to another buffer page and never back to a "head". This makes the handling of the ring buffer less complex, since the traversing of the ring buffer pages no longer needs to account for the head pointer. This change also is needed to make the ring buffer lockless. [ Changes in version 2: - Added change that Lai Jiangshan mentioned. From: Lai Jiangshan Date: Thu, 11 Jun 2009 11:25:48 +0800 LKML-Reference: <4A30793C.6090208@cn.fujitsu.com> I'm not sure whether these 4 lines: bpage = list_entry(pages.next, struct buffer_page, list); list_del_init(&bpage->list); cpu_buffer->pages = &bpage->list; list_splice(&pages, cpu_buffer->pages); equal to these 2 lines: cpu_buffer->pages = pages.next; list_del(&pages); If there are equivalent, I think the second one are simpler. It may be not a really necessarily cleanup. What I asked is: if there are equivalent, could you use these two line: cpu_buffer->pages = pages.next; list_del(&pages); ] [ Impact: simplify the ring buffer to help make it lockless ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..7c0168ad6d51 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -406,7 +406,7 @@ struct ring_buffer_per_cpu { spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; - struct list_head pages; + struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ @@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -521,12 +521,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; + WARN_ON(!nr_pages); + for (i = 0; i < nr_pages; i++) { bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); @@ -541,7 +542,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } - list_splice(&pages, head); + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. + */ + cpu_buffer->pages = pages.next; + list_del(&pages); rb_check_pages(cpu_buffer); @@ -573,7 +580,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&cpu_buffer->pages); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -594,7 +600,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) goto fail_free_reader; cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; return cpu_buffer; @@ -609,15 +615,20 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } + kfree(cpu_buffer); } @@ -760,14 +771,14 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; - p = cpu_buffer->pages.next; + p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; rb_reset_cpu(cpu_buffer); @@ -795,7 +806,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); - list_add_tail(&bpage->list, &cpu_buffer->pages); + list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -992,9 +1003,6 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, { struct list_head *p = (*bpage)->list.next; - if (p == &cpu_buffer->pages) - p = p->next; - *bpage = list_entry(p, struct buffer_page, list); } @@ -2247,6 +2255,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidently swap it. + */ + cpu_buffer->pages = reader->list.prev; + local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -2719,7 +2734,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); -- cgit From 77ae365eca895061c8bf2b2e3ae1d9ea62869739 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Mar 2009 11:00:29 -0400 Subject: ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 886 +++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.c | 3 - 2 files changed, 738 insertions(+), 151 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c0168ad6d51..e648ba4f70e0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -322,6 +322,14 @@ struct buffer_data_page { unsigned char data[]; /* data of buffer page */ }; +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ @@ -330,6 +338,21 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); @@ -403,7 +426,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -411,13 +434,12 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; - unsigned long nmi_dropped; - unsigned long commit_overrun; - unsigned long overrun; - unsigned long read; + local_t commit_overrun; + local_t overrun; local_t entries; local_t committing; local_t commits; + unsigned long read; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -489,6 +511,385 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarially. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the give page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static int inline +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_dactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. */ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = (unsigned long)cmpxchg(&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, &new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + */ + local_cmpxchg(&next_page->write, old_write, val); + local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -501,11 +902,16 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + rb_head_page_deactivate(cpu_buffer); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; + if (rb_check_list(cpu_buffer, head)) + return -1; + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, bpage->list.next->prev != &bpage->list)) @@ -513,8 +919,12 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, bpage->list.prev->next != &bpage->list)) return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; } + rb_head_page_activate(cpu_buffer); + return 0; } @@ -533,6 +943,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); @@ -586,6 +999,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!bpage) goto fail_free_buffer; + rb_check_bpage(cpu_buffer, bpage); + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -603,6 +1018,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_head_page_activate(cpu_buffer); + return cpu_buffer; fail_free_reader: @@ -620,6 +1037,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(cpu_buffer->reader_page); + rb_head_page_deactivate(cpu_buffer); + if (head) { list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); @@ -770,6 +1189,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; @@ -800,6 +1221,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; @@ -809,6 +1233,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -958,22 +1383,15 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->head_page, - cpu_buffer->head_page->read); -} - static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_write(struct buffer_page *bpage) +static inline unsigned long rb_page_write(struct buffer_page *bpage) { - return local_read(&bpage->write); + return local_read(&bpage->write) & RB_WRITE_MASK; } static inline unsigned rb_page_commit(struct buffer_page *bpage) @@ -981,6 +1399,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) return local_read(&bpage->page->commit); } +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + /* Size is determined by what has been commited */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -993,19 +1416,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->head_page); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = (*bpage)->list.next; - - *bpage = list_entry(p, struct buffer_page, list); -} - static inline unsigned rb_event_index(struct ring_buffer_event *event) { @@ -1031,6 +1441,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { + unsigned long max_count; + /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit @@ -1040,9 +1452,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ again: + max_count = cpu_buffer->buffer->pages * 100; + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; @@ -1051,8 +1470,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); barrier(); } @@ -1085,7 +1508,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = cpu_buffer->head_page; + iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(cpu_buffer, &iter->head_page); @@ -1129,6 +1552,163 @@ rb_update_event(struct ring_buffer_event *event, } } +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupt the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. + * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved passed next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1207,96 +1787,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - bool lock_taken = false; - unsigned long flags; + struct buffer_page *next_page; + int ret; next_page = tail_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - rb_inc_page(cpu_buffer, &next_page); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; + local_inc(&cpu_buffer->commit_overrun); goto out_reset; } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also the on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } } } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ *ts = rb_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + next_page->page->time_stamp = *ts; } - rb_reset_tail(cpu_buffer, tail_page, tail, length); + out_again: - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -1305,9 +1882,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* reset write */ rb_reset_tail(cpu_buffer, tail_page, tail, length); - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); return NULL; } @@ -1324,6 +1898,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; tail = write - length; /* See if we shot pass the end of this buffer page */ @@ -1368,12 +1945,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = cpu_buffer->tail_page; if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ + old_index += write_mask; + new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) return 1; @@ -1882,9 +2463,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + return reader->read == rb_page_commit(reader) && (commit == reader || (commit == head && @@ -1975,7 +2560,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; return ret; @@ -1996,32 +2581,12 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->overrun; + ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); -/** - * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->nmi_dropped; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); - /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits * @buffer: The ring buffer @@ -2037,7 +2602,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->commit_overrun; + ret = local_read(&cpu_buffer->commit_overrun); return ret; } @@ -2060,7 +2625,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += (local_read(&cpu_buffer->entries) - - cpu_buffer->overrun) - cpu_buffer->read; + local_read(&cpu_buffer->overrun)) - cpu_buffer->read; } return entries; @@ -2083,7 +2648,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - overruns += cpu_buffer->overrun; + overruns += local_read(&cpu_buffer->overrun); } return overruns; @@ -2096,8 +2661,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head_page->read; + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -2214,6 +2781,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; int nr_loops = 0; + int ret; local_irq_save(flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2247,11 +2815,17 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* - * Splice the empty reader page into the list around the head. * Reset the reader page to size zero. */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); - reader = cpu_buffer->head_page; + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -2262,22 +2836,35 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ cpu_buffer->pages = reader->list.prev; - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); - /* Make the reader page now replace the head */ - reader->list.prev->next = &cpu_buffer->reader_page->list; - reader->list.next->prev = &cpu_buffer->reader_page->list; + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* - * If the tail is on the reader, then we must set the head - * to the inserted page, otherwise we set it one before. + * If we did not convert it, then we must try again. */ - cpu_buffer->head_page = cpu_buffer->reader_page; + if (!ret) + goto spin; - if (cpu_buffer->commit_page != reader) - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. + */ + reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; @@ -2733,6 +3320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + rb_head_page_deactivate(cpu_buffer); + cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); @@ -2750,16 +3339,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - cpu_buffer->nmi_dropped = 0; - cpu_buffer->commit_overrun = 0; - cpu_buffer->overrun = 0; - cpu_buffer->read = 0; + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + + rb_head_page_activate(cpu_buffer); } /** @@ -3107,7 +3697,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = 0; } else { /* update the entry counter */ - cpu_buffer->read += local_read(&reader->entries); + cpu_buffer->read += rb_page_entries(reader); /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb3afc8b306..b591f7a1bd7b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3630,9 +3630,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); - trace_seq_printf(s, "nmi dropped: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit From c5cb183608167c744cb28bbd85884be5a4ce875d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 9 Jul 2009 16:20:12 +0800 Subject: tracing/filter: Remove preds from struct event_subsystem No need to save preds to event_subsystem, because it's not used. Signed-off-by: Xiao Guangrong Acked-by: Tom Zanussi Reviewed-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A55A83C.1030005@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 39 ++++++-------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf46..b9aae72d13db 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -420,17 +420,7 @@ EXPORT_SYMBOL_GPL(init_preds); static void filter_free_subsystem_preds(struct event_subsystem *system) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; - int i; - - if (filter->n_preds) { - for (i = 0; i < filter->n_preds; i++) - filter_free_pred(filter->preds[i]); - kfree(filter->preds); - filter->preds = NULL; - filter->n_preds = 0; - } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -607,26 +597,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct filter_pred *pred, char *filter_string) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; int err = 0; - if (!filter->preds) { - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - } - - if (filter->n_preds == MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -1029,12 +1002,12 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) { + if (call) err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else + else err = filter_add_subsystem_pred(ps, system, pred, filter_string); + filter_free_pred(pred); if (err) return err; @@ -1048,12 +1021,12 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); - if (call) { + if (call) err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else + else err = filter_add_subsystem_pred(ps, system, pred, filter_string); + filter_free_pred(pred); if (err) return err; -- cgit From dc82ec98a4727fd51b77e92d05fe7d2db3dcc11c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 9 Jul 2009 16:22:22 +0800 Subject: tracing/filter: Remove empty subsystem and its directory Remove empty subsystem and its directory when module unload. Before patch: # rmmod trace-events-sample.ko # ls sample enable filter After patch: # rmmod trace-events-sample.ko # ls sample ls: cannot access sample: No such file or directory Signed-off-by: Xiao Guangrong Acked-by: Tom Zanussi Reviewed-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A55A8BE.9010707@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 52eb0d8dcd75..94305c7bc11c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,6 +757,7 @@ struct event_subsystem { const char *name; struct dentry *entry; void *filter; + int nr_events; }; struct filter_pred; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fecac1314cbe..90cf9360e140 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -851,8 +851,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + if (strcmp(system->name, name) == 0) { + system->nr_events++; return system->entry; + } } /* need to create new entry */ @@ -871,6 +873,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } + system->nr_events = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -905,6 +908,32 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return system->entry; } +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *id, @@ -1079,6 +1108,7 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); + remove_subsystem_dir(call->system); } } -- cgit From 68baafcfc46074c4bb4e4c3115c2c76a8a85f37d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 9 Jul 2009 04:46:29 +0200 Subject: tracing/function-graph-tracer: Use the %pf format Remove the obsolete seq_print_ip_sym() usage and replace it by the %pf format in order to print function symbols. Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Cc: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1247107590-6428-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb53..abf7c4ae2c8b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -565,11 +565,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "();\n"); + ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -612,11 +608,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit From 6a167c655858cbec4175532fd00417661c87f149 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 9 Jul 2009 04:46:30 +0200 Subject: tracing/kmemtrace: Use the %pf format Remove the obsolete seq_print_ip_sym() usage and replace it by the %pf format in order to print function symbols. Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Cc: Steven Rostedt Cc: Lai Jiangshan Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu LKML-Reference: <1247107590-6428-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 74903b62bcb6..2f6fa47d410c 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -389,19 +389,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Node */ - ret = trace_seq_printf(s, "%4d ", entry->node); + /* Node and call site*/ + ret = trace_seq_printf(s, "%4d %pf\n", entry->node, + (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } @@ -447,19 +440,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Skip node */ - ret = trace_seq_printf(s, " "); + /* Skip node and print call site*/ + ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } -- cgit From 80098c200e2ee3b4c86a9d1e156dbcd05380e08f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 6 Jul 2009 16:15:04 +0800 Subject: kmemtrace: Rename some functions So we have: - kmemtrace_print_alloc/free() for kmemtrace default output - kmemtrace_print_alloc/free_user() for binary output used by kmemtrace-user. Suggested-by: Eduard - Gabriel Munteanu Signed-off-by: Li Zefan Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A51B288.70505@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 2f6fa47d410c..dda53ccf749b 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -239,7 +239,7 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -259,7 +259,7 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags) +kmemtrace_print_free(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -277,7 +277,7 @@ kmemtrace_print_free_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -311,7 +311,7 @@ kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user_bin(struct trace_iterator *iter, int flags) +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -467,14 +467,14 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) static struct trace_event kmem_trace_alloc = { .type = TRACE_KMEM_ALLOC, - .trace = kmemtrace_print_alloc_user, - .binary = kmemtrace_print_alloc_user_bin, + .trace = kmemtrace_print_alloc, + .binary = kmemtrace_print_alloc_user, }; static struct trace_event kmem_trace_free = { .type = TRACE_KMEM_FREE, - .trace = kmemtrace_print_free_user, - .binary = kmemtrace_print_free_user_bin, + .trace = kmemtrace_print_free, + .binary = kmemtrace_print_free_user, }; static struct tracer kmem_tracer __read_mostly = { -- cgit From d8ea37d5de58d35a39d0b4e7d209751aaa1b8174 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 6 Jul 2009 16:10:18 +0800 Subject: tracing/stat: Add stat_release() callback Add stat_release() callback to struct tracer_stat, so a stat tracer can release it's entries after the stat file has been read out. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A51B16A.6020708@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 7 +++++-- kernel/trace/trace_stat.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e493342..f069461f10bd 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -49,7 +49,8 @@ static struct dentry *stat_dir; * but it will at least advance closer to the next one * to be released. */ -static struct rb_node *release_next(struct rb_node *node) +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) { struct stat_node *snode; struct rb_node *parent = rb_parent(node); @@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node) parent->rb_right = NULL; snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); kfree(snode); return parent; @@ -78,7 +81,7 @@ static void reset_stat_session(struct stat_session *session) struct rb_node *node = session->stat_root.rb_node; while (node) - node = release_next(node); + node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index f3546a2cd826..8f03914b9a6a 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -18,6 +18,8 @@ struct tracer_stat { int (*stat_cmp)(void *p1, void *p2); /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); /* Print the headers of your stat entries */ int (*stat_headers)(struct seq_file *s); }; -- cgit From a35780005eb256eb5ec83ffcc802967295887a45 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 6 Jul 2009 16:10:23 +0800 Subject: tracing/workqueues: Add refcnt to struct cpu_workqueue_stats The stat entries can be freed when the stat file is being read. The worse is, the ptr can be freed immediately after it's returned from workqueue_stat_start/next(). Add a refcnt to struct cpu_workqueue_stats to avoid use-after-free. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A51B16F.6010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 97fcea4acce1..40cafb07dffd 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace_stat.h" #include "trace.h" @@ -16,6 +17,7 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; + struct kref kref; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -39,6 +41,11 @@ struct workqueue_global_stats { static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) +static void cpu_workqueue_stat_free(struct kref *kref) +{ + kfree(container_of(kref, struct cpu_workqueue_stats, kref)); +} + /* Insertion of a work */ static void probe_workqueue_insertion(struct task_struct *wq_thread, @@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) return; } INIT_LIST_HEAD(&cws->list); + kref_init(&cws->kref); cws->cpu = cpu; - cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); @@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) list) { if (node->pid == wq_thread->pid) { list_del(&node->list); - kfree(node); + kref_put(&node->kref, cpu_workqueue_stat_free); goto found; } } @@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); + kref_get(&ret->kref); + } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); @@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace) static void *workqueue_stat_next(void *prev, int idx) { struct cpu_workqueue_stats *prev_cws = prev; + struct cpu_workqueue_stats *ret; int cpu = prev_cws->cpu; unsigned long flags; - void *ret = NULL; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { @@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx) return NULL; } while (!(ret = workqueue_stat_start_cpu(cpu))); return ret; + } else { + ret = list_entry(prev_cws->list.next, + struct cpu_workqueue_stats, list); + kref_get(&ret->kref); } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, - list); + return ret; } static int workqueue_stat_show(struct seq_file *s, void *p) @@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) return 0; } +static void workqueue_stat_release(void *stat) +{ + struct cpu_workqueue_stats *node = stat; + + kref_put(&node->kref, cpu_workqueue_stat_free); +} + static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); @@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = { .stat_start = workqueue_stat_start, .stat_next = workqueue_stat_next, .stat_show = workqueue_stat_show, + .stat_release = workqueue_stat_release, .stat_headers = workqueue_stat_headers }; -- cgit From da706d8bc833e7153622435560422e653bdb2e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Jul 2009 16:27:30 +0800 Subject: ring_buffer: Fix warning while ignoring cmpxchg return value kernel/trace/ring_buffer.c: In function 'rb_tail_page_update': kernel/trace/ring_buffer.c:849: warning: value computed is not used kernel/trace/ring_buffer.c:850: warning: value computed is not used Add "(void)"s to fix this warning, because we don't need here to handle the fail case of cmpxchg, it's fine if an interrupt already did the job. Changed from V1: Add a comment(which is written by Steven) for it. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e648ba4f70e0..51633d74a21e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -845,9 +845,14 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. */ - local_cmpxchg(&next_page->write, old_write, val); - local_cmpxchg(&next_page->entries, old_entries, eval); + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. -- cgit From 64fbcd162819bddaf0d99e78b16371b655aa5dee Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 15 Jul 2009 12:32:15 +0800 Subject: tracing/function: Simplify __ftrace_replace_code() Rewrite the __ftrace_replace_code() function, simplify it, but don't change the code's logic. First, we get the state we want to set, if the record has the same state, then do nothing, otherwise enable/disable it. Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 72 +++++++++++++-------------------------------------- 1 file changed, 18 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bce9e01a29c8..217caeca71cd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1017,71 +1017,35 @@ static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; - unsigned long ip, fl; + unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; - ip = rec->ip; - /* - * If this record is not to be traced and - * it is not enabled then do nothing. + * If this record is not to be traced or we want to disable it, + * then disable it. * - * If this record is not to be traced and - * it is enabled then disable it. + * If we want to enable it and filtering is off, then enable it. * + * If we want to enable it and filtering is on, enable it only if + * it's filtered */ - if (rec->flags & FTRACE_FL_NOTRACE) { - if (rec->flags & FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - return 0; - - } else if (ftrace_filtered && enable) { - /* - * Filtering is on: - */ - - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - - /* Record is filtered and enabled, do nothing */ - if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) - return 0; - - /* Record is not filtered or enabled, do nothing */ - if (!fl) - return 0; - - /* Record is not filtered but enabled, disable it */ - if (fl == FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - /* Otherwise record is filtered but not enabled, enable it */ - rec->flags |= FTRACE_FL_ENABLED; - } else { - /* Disable or not filtered */ - - if (enable) { - /* if record is enabled, do nothing */ - if (rec->flags & FTRACE_FL_ENABLED) - return 0; - - rec->flags |= FTRACE_FL_ENABLED; - - } else { + if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + flag = FTRACE_FL_ENABLED; + } - /* if record is not enabled, do nothing */ - if (!(rec->flags & FTRACE_FL_ENABLED)) - return 0; + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return 0; - rec->flags &= ~FTRACE_FL_ENABLED; - } + if (flag) { + rec->flags |= FTRACE_FL_ENABLED; + return ftrace_make_call(rec, ftrace_addr); } - if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, ftrace_addr); - else - return ftrace_make_nop(NULL, rec, ftrace_addr); + rec->flags &= ~FTRACE_FL_ENABLED; + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) -- cgit From 79173bf556417a737e9d2e096e0788452ec30a61 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2009 14:17:11 +0800 Subject: tracing/trace_stack: Cleanup for trace_lookup_stack() We can directly use %pF input format instead of sprint_symbol() and %s input format. Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stack.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af910124..a4dc8d9ad1b1 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -234,15 +234,8 @@ static void t_stop(struct seq_file *m, void *p) static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - sprint_symbol(str, addr); - - return seq_printf(m, "%s\n", str); -#else - return seq_printf(m, "%p\n", (void*)addr); -#endif + return seq_printf(m, "%pF\n", (void *)addr); } static void print_disabled(struct seq_file *m) -- cgit From 6f2f3cf00ee32f75ba007a46bab88a54d68a5deb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2009 14:21:08 +0800 Subject: tracing/function: Cleanup for function tracer We can directly use %pf input format instead of kallsyms_lookup() and %s input format Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 17 +++-------------- kernel/trace/trace_functions.c | 4 +--- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 217caeca71cd..80a97a51442d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1403,18 +1403,13 @@ static int t_hash_show(struct seq_file *m, void *v) { struct ftrace_func_probe *rec; struct hlist_node *hnd = v; - char str[KSYM_SYMBOL_LEN]; rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); - - kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); - seq_printf(m, "%s", str); + seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1512,7 +1507,6 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; if (iter->flags & FTRACE_ITER_HASH) return t_hash_show(m, v); @@ -1525,9 +1519,7 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", (void *)rec->ip); return 0; } @@ -2508,7 +2500,6 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { unsigned long *ptr = v; - char str[KSYM_SYMBOL_LEN]; if (!ptr) return 0; @@ -2518,9 +2509,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } - kallsyms_lookup(*ptr, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", v); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7402144bff21..b53dc994dfb6 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -288,11 +288,9 @@ static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - char str[KSYM_SYMBOL_LEN]; long count = (long)data; - kallsyms_lookup(ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); + seq_printf(m, "%pf:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); -- cgit From 566b0aaf798a0dddfc455d1a5b05c424c6686c65 Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Thu, 16 Jul 2009 21:44:26 +0200 Subject: tracing: Remove unused fields/variables Signed-off-by: Jiri Olsa Cc: rostedt@goodmis.org LKML-Reference: <1247773468-11594-2-git-send-email-jolsa@redhat.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 --- kernel/trace/trace.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6227dc806377..24e3ff53b24b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1339,7 +1339,6 @@ struct ftrace_iterator { unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; unsigned buffer_idx; - unsigned filtered; }; static void * @@ -2268,7 +2267,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } if (isspace(ch)) { - iter->filtered++; iter->buffer[iter->buffer_idx] = 0; ret = ftrace_process_regex(iter->buffer, iter->buffer_idx, enable); @@ -2399,7 +2397,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) iter = file->private_data; if (iter->buffer_idx) { - iter->filtered++; iter->buffer[iter->buffer_idx] = 0; ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60a49488b791..e30e6b1dbd4e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4265,7 +4265,6 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { - struct trace_array_cpu *data; int ring_buf_size; int i; int ret = -ENOMEM; @@ -4315,7 +4314,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); } -- cgit From 7d536cb3fb9993bdcd5a2fbaa6b0670ded4e101c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 16 Jul 2009 10:54:02 +0800 Subject: tracing/events: record the size of dynamic arrays When a dynamic array is defined, we add __data_loc_foo in trace_entry to record the offset of the array, but the size of the array is not recorded, which causes 2 problems: - the event filter just compares the first 2 chars of the strings. - parsers can't parse dynamic arrays. So we encode the size of each dynamic array in the higher 16 bits of __data_loc_foo, while the offset is in lower 16 bits. Signed-off-by: Li Zefan LKML-Reference: <4A5E964A.9000403@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b9aae72d13db..1c80ef702b83 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -176,11 +176,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - unsigned short str_loc = *(unsigned short *)(event + pred->offset); + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = strncmp(addr, pred->str_val, str_len); match = (!cmp) ^ pred->not; -- cgit From ff4e9da2330beb8d64498a513d3f9694e941b01a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 22 Jun 2009 10:33:07 +0800 Subject: tracing: cleanup for tracing_trace_options_read() '\n' is already appended, and what we need is just an extra space for the '\0'. Signed-off-by: Xiao Guangrong LKML-Reference: <4A3EED63.3090908@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e30e6b1dbd4e..38a4a3ee749d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2256,8 +2256,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and newline */ } - /* +2 for \n and \0 */ - buf = kmalloc(len + 2, GFP_KERNEL); + /* +1 for \0 */ + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) { mutex_unlock(&trace_types_lock); return -ENOMEM; @@ -2280,7 +2280,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 2); + WARN_ON(r >= len + 1); r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit From 1f9963cbb0280e0cd554161e00f1a0eeddbf1ae1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 20 Jul 2009 10:20:53 +0800 Subject: tracing/filters: improve subsystem filter Currently a subsystem filter should be applicable to all events under the subsystem, and if it failed, all the event filters will be cleared. Those behaviors make subsys filter much less useful: # echo 'vec == 1' > irq/softirq_entry/filter # echo 'irq == 5' > irq/filter bash: echo: write error: Invalid argument # cat irq/softirq_entry/filter none I'd expect it set the filter for irq_handler_entry/exit, and not touch softirq_entry/exit. The basic idea is, try to see if the filter can be applied to which events, and then just apply to the those events: # echo 'vec == 1' > softirq_entry/filter # echo 'irq == 5' > filter # cat irq_handler_entry/filter irq == 5 # cat softirq_entry/filter vec == 1 Changelog for v2: - do some cleanups to address Frederic's comments. Inspired-by: Steven Rostedt Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker LKML-Reference: <4A63D485.7030703@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 124 ++++++++++++++++++++++++------------- 2 files changed, 84 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 94305c7bc11c..758b0dbed552 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -750,13 +750,14 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; + bool no_reset; }; struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - void *filter; + struct event_filter *filter; int nr_events; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1c80ef702b83..27c2dbea3710 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -420,7 +420,14 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void filter_free_subsystem_preds(struct event_subsystem *system) +enum { + FILTER_DISABLE_ALL, + FILTER_INIT_NO_RESET, + FILTER_SKIP_NO_RESET, +}; + +static void filter_free_subsystem_preds(struct event_subsystem *system, + int flag) { struct ftrace_event_call *call; @@ -428,6 +435,14 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) if (!call->define_fields) continue; + if (flag == FILTER_INIT_NO_RESET) { + call->filter->no_reset = false; + continue; + } + + if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) + continue; + if (!strcmp(call->system, system->name)) { filter_disable_preds(call); remove_filter_string(call->filter); @@ -529,7 +544,8 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + bool dry_run) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -541,10 +557,12 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_AND) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_and); + fn = filter_pred_and; + goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_or); + fn = filter_pred_or; + goto add_pred_fn; } field = find_event_field(call, pred->field_name); @@ -567,9 +585,6 @@ static int filter_add_pred(struct filter_parse_state *ps, else fn = filter_pred_strloc; pred->str_len = field->size; - if (pred->op == OP_NE) - pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); } else { if (field->is_signed) ret = strict_strtoll(pred->str_val, 0, &val); @@ -580,27 +595,33 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } pred->val = val; - } - fn = select_comparison_fn(pred->op, field->size, field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } } if (pred->op == OP_NE) pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); +add_pred_fn: + if (!dry_run) + return filter_add_pred_fn(ps, call, pred, fn); + return 0; } static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct event_subsystem *system, struct filter_pred *pred, - char *filter_string) + char *filter_string, + bool dry_run) { struct ftrace_event_call *call; int err = 0; + bool fail = true; list_for_each_entry(call, &ftrace_events, list) { @@ -610,16 +631,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (strcmp(call->system, system->name)) continue; - err = filter_add_pred(ps, call, pred); - if (err) { - filter_free_subsystem_preds(system); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - goto out; - } - replace_filter_string(call->filter, filter_string); + if (call->filter->no_reset) + continue; + + err = filter_add_pred(ps, call, pred, dry_run); + if (err) + call->filter->no_reset = true; + else + fail = false; + + if (!dry_run) + replace_filter_string(call->filter, filter_string); } -out: - return err; + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return err; + } + return 0; } static void parse_init(struct filter_parse_state *ps, @@ -978,12 +1007,14 @@ static int check_preds(struct filter_parse_state *ps) static int replace_preds(struct event_subsystem *system, struct ftrace_event_call *call, struct filter_parse_state *ps, - char *filter_string) + char *filter_string, + bool dry_run) { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; int err; + int n_preds = 0; err = check_preds(ps); if (err) @@ -1002,19 +1033,14 @@ static int replace_preds(struct event_subsystem *system, continue; } + if (n_preds++ == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) - err = filter_add_pred(ps, call, pred); - else - err = filter_add_subsystem_pred(ps, system, - pred, filter_string); - filter_free_pred(pred); - if (err) - return err; - - operand1 = operand2 = NULL; - continue; + goto add_pred; } if (!operand1 || !operand2) { @@ -1023,11 +1049,12 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); +add_pred: if (call) - err = filter_add_pred(ps, call, pred); + err = filter_add_pred(ps, call, pred, false); else err = filter_add_subsystem_pred(ps, system, pred, - filter_string); + filter_string, dry_run); filter_free_pred(pred); if (err) return err; @@ -1068,7 +1095,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string); + err = replace_preds(NULL, call, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); @@ -1092,7 +1119,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); remove_filter_string(system->filter); mutex_unlock(&event_mutex); return 0; @@ -1103,7 +1130,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); parse_init(ps, filter_ops, filter_string); @@ -1113,9 +1139,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - err = replace_preds(system, NULL, ps, filter_string); - if (err) + filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); + + /* try to see the filter can be applied to which events */ + err = replace_preds(system, NULL, ps, filter_string, true); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + + /* really apply the filter to the events */ + err = replace_preds(system, NULL, ps, filter_string, false); + if (err) { append_filter_err(ps, system->filter); + filter_free_subsystem_preds(system, 2); + } out: filter_opstack_clear(ps); -- cgit From 0c9e6f639aed490202bbc79214f4495cf4bfde58 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:26:06 +0800 Subject: tracing: Simplify print_graph_cpu() print_graph_cpu() is little over-designed. And "log10_all" may be wrong when there are holes in cpu_online_mask: the max online cpu id > cpumask_weight(cpu_online_mask) So change it by using a static column length for the cpu matching nr_cpu_ids number of decimal characters. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <4A6EEE5E.2000001@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 30 ++++-------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index abf7c4ae2c8b..e30472da15d5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -183,43 +183,19 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } -static inline int log10_cpu(int nb) -{ - if (nb / 100) - return 3; - if (nb / 10) - return 2; - return 1; -} +static int max_bytes_for_cpu; static enum print_line_t print_graph_cpu(struct trace_seq *s, int cpu) { - int i; int ret; - int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " "); - - /* - * Tricky - we space the CPU field according to the max - * number of online CPUs. On a 2-cpu system it would take - * a maximum of 1 digit - on a 128 cpu system it would - * take up to 3 digits: - */ - for (i = 0; i < log10_all - log10_this; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "%d) ", cpu); + ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -919,6 +895,8 @@ static struct tracer graph_trace __read_mostly = { static __init int init_graph_trace(void) { + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + return register_tracer(&graph_trace); } -- cgit From 5e5bf483986ad86ad25f25eec5299c86eb2d1c57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 17:11:12 +0200 Subject: tracing/core: Turn ftrace_cpu_disabled into a global var In order to prepare the moving of the function graph tracer insertion helpers from trace.c to trace_functions_graph.c, we need to export the ftrace_cpu_disabled variable. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 38a4a3ee749d..b6211d604131 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,7 +89,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 758b0dbed552..c7e92732982d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -519,6 +519,7 @@ extern int DYN_FTRACE_TEST_NAME(void); extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, -- cgit From c0a0d0d3f65284c71115a9bb1ed801ee33eeb552 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 17:51:13 +0200 Subject: tracing/core: Make the stack entry helpers global Make the stacktrace event insertion helpers globals. This has two effects: - Prepare for moving the sched events insertion helpers to the sched switch tracer file. - Move some ifdef outside function definitions Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++---------------- kernel/trace/trace.h | 28 +++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6211d604131..d6059a493e7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -866,10 +866,6 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, return event; } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, int skip, int pc); -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc); static inline void __trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, @@ -1003,11 +999,11 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, ip, parent_ip, flags, pc); } +#ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; @@ -1028,12 +1024,10 @@ static void __ftrace_trace_stack(struct trace_array *tr, save_stack_trace(&trace); if (!filter_check_discard(call, entry, tr->buffer, event)) ring_buffer_unlock_commit(tr->buffer, event); -#endif } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; @@ -1041,17 +1035,14 @@ static void ftrace_trace_stack(struct trace_array *tr, __ftrace_trace_stack(tr, flags, skip, pc); } -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { __ftrace_trace_stack(tr, flags, skip, pc); } -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc) +void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -1076,7 +1067,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, tr->buffer, event)) ring_buffer_unlock_commit(tr->buffer, event); -#endif } #ifdef UNUSED @@ -1086,6 +1076,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) } #endif /* UNUSED */ +#endif /* CONFIG_STACKTRACE */ + static void ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c7e92732982d..116524d62366 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -489,9 +489,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc); +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc); + +void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, + int pc); + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); +#else +static inline void ftrace_trace_stack(struct trace_array *tr, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_userstack(struct trace_array *tr, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ extern cycle_t ftrace_now(int cpu); -- cgit From 82e04af498a85ba425efe77580b7ba08234411df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 18:00:29 +0200 Subject: tracing: Move sched event insertion helpers in the sched switch tracer file The sched events helpers which insert the sched switch and wakeup events into the ring buffer currently reside in trace.c But this file is quite overloaded and the right place for these helpers is in the sched switch tracer file. Then move them to trace_functions.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 56 -------------------------------------- kernel/trace/trace_sched_switch.c | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d6059a493e7f..1b73acb40e56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1105,62 +1105,6 @@ __trace_special(void *__tr, void *__data, ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); } -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, flags, pc); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); -} - void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a98106dd979c..e1285d7b5488 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -20,6 +20,34 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static int sched_stopped; + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(tr, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); +} + static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) @@ -49,6 +77,35 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, local_irq_restore(flags); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(tr, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); +} + static void probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { -- cgit From 1a0799a8fef5acc6503f9c5e79b2cd003317826c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 18:59:58 +0200 Subject: tracing/function-graph-tracer: Move graph event insertion helpers in the graph tracer file The function graph events helpers which insert the function entry and return events into the ring buffer currently reside in trace.c But this file is quite overloaded and the right place for these helpers is in the function graph tracer file. Then move them to trace_functions_graph.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 110 ------------------------------- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 122 ++++++++++++++++++++++++++++++++++- kernel/trace/trace_selftest.c | 1 + 4 files changed, 121 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b73acb40e56..0cfd1a62def1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -942,54 +942,6 @@ trace_function(struct trace_array *tr, ring_buffer_unlock_commit(tr->buffer, event); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_entry; - struct ring_buffer_event *event; - struct ftrace_graph_ent_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return 0; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); - if (!event) - return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); - - return 1; -} - -static void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_exit; - struct ring_buffer_event *event; - struct ftrace_graph_ret_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ret = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); -} -#endif - void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -1129,68 +1081,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int trace_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int ret; - int cpu; - int pc; - - if (!ftrace_trace_task(current)) - return 0; - - if (!ftrace_graph_addr(trace->func)) - return 0; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else { - ret = 0; - } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); - - atomic_dec(&data->disabled); - local_irq_restore(flags); - - return ret; -} - -void trace_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - if (!trace->depth) - clear_tsk_trace_graph(current); - atomic_dec(&data->disabled); - local_irq_restore(flags); -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - /** * trace_vbprintk - write binary msg to tracing buffer * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 116524d62366..9301f1263c5c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -471,6 +471,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e30472da15d5..f97244a41a4f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -/* pid on the last trace processed */ +static struct trace_array *graph_array; /* Add a function return address to the trace stack on thread info.*/ @@ -166,10 +166,121 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +static int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return 0; + + event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(tr->buffer, event); + + return 1; +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (unlikely(!tr)) + return 0; + + if (!ftrace_trace_task(current)) + return 0; + + if (!ftrace_graph_addr(trace->func)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +static void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(tr->buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static int graph_trace_init(struct trace_array *tr) { - int ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + int ret; + + graph_array = tr; + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -177,6 +288,11 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 00dd6485bdd7..d2cdbabb4ead 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * to detect and recover from possible hangs */ tracing_reset_online_cpus(tr); + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); if (ret) { -- cgit From a2ca5e03b6a5a1d401062f0a7f78888cf9e5e3b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 Aug 2009 07:32:21 +0200 Subject: tracing/events: Only define remove_subsystem_dir() if CONFIG_MODULES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we disable modules, we get the following warning in ftrace events file: kernel/trace/trace_events.c:912: attention : ‘remove_subsystem_dir’ defined but not used remove_subystem_dir() is useless if !CONFIG_MODULES, then move it to the appropriate #ifdef section of trace_events.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace_events.c | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 90cf9360e140..70ecb7653b46 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -908,32 +908,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return system->entry; } -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; - - if (strcmp(name, TRACE_SYSTEM) == 0) - return; - - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - struct event_filter *filter = system->filter; - - debugfs_remove_recursive(system->entry); - list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); - } - break; - } - } -} - static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *id, @@ -1018,6 +992,32 @@ struct ftrace_module_file_ops { struct file_operations filter; }; +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { -- cgit From fb82ad719831db58e9baa4c67015aae3fe27e7e3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:36 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. This updates his patch for the same problem in the tracing tree (which has changed this code quite substantially). Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746576.6453.30.camel@tropicana> Signed-off-by: Ingo Molnar The original report: create_logical_pred() could sometimes return NULL. It's a static checker complaining rather than problems at runtime... --- kernel/trace/trace_events_filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27c2dbea3710..490337abed75 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1050,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, pred = create_pred(elt->op, operand1, operand2); add_pred: + if (!pred) + return -ENOMEM; if (call) err = filter_add_pred(ps, call, pred, false); else -- cgit From 7770841e63730d62928b0879498064e9614b2ce0 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 7 Aug 2009 18:53:21 +0800 Subject: tracing: Rename set_tracer_flags()'s local variable trace_flags set_tracer_flags() have a local variable named trace_flags which has the same name than a global one in the same scope. This leads to confusion, using tracer_flags should be better by its meaning. Changelog: v1->v2: Simplified another patch in this patchset, no change in this patch. Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e793cda91dd3..8ac204360a39 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2118,23 +2118,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* Try to assign a tracer specific option */ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { - struct tracer_flags *trace_flags = trace->flags; + struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int ret = 0, i = 0; int len; - for (i = 0; trace_flags->opts[i].name; i++) { - opts = &trace_flags->opts[i]; + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; len = strlen(opts->name); if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(trace_flags->val, + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); break; } } /* Not found */ - if (!trace_flags->opts[i].name) + if (!tracer_flags->opts[i].name) return -EINVAL; /* Refused to handle */ @@ -2142,9 +2142,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return ret; if (neg) - trace_flags->val &= ~opts->bit; + tracer_flags->val &= ~opts->bit; else - trace_flags->val |= opts->bit; + tracer_flags->val |= opts->bit; return 0; } -- cgit From 066e0378c23f0a3db730893f6a041e4a3922a385 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:23 -0400 Subject: tracing: Call arch_init_ftrace_syscalls at boot Call arch_init_ftrace_syscalls at boot, so we can determine early the set of syscalls for the syscall trace events. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e579645ac86..08aed439feaf 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,7 +106,6 @@ void start_ftrace_syscalls(void) if (++refcount != 1) goto unlock; - arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { -- cgit From a871bd33a6c0bc86fb47cd02ea2650dd43d3d95f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:31 -0400 Subject: tracing: Add syscall tracepoints add two tracepoints in syscall exit and entry path, conditioned on TIF_SYSCALL_FTRACE. Supports the syscall trace event code. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/tracepoint.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1ef5d3a601c7..070a42bb8920 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -24,6 +24,7 @@ #include #include #include +#include extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -577,3 +578,40 @@ static int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ + +static DEFINE_MUTEX(regfunc_mutex); +static int sys_tracepoint_refcount; + +void syscall_regfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + mutex_lock(®func_mutex); + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + sys_tracepoint_refcount++; + mutex_unlock(®func_mutex); +} + +void syscall_unregfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + mutex_lock(®func_mutex); + sys_tracepoint_refcount--; + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + mutex_unlock(®func_mutex); +} -- cgit From f744bd576a827c5b02e756b81fc2578edf8179b8 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:39 -0400 Subject: tracing: Raw_init() bailout in trace event register fail case Allow the return value of raw_init() trace event callback to bail us out of creating a trace event file, in case we fail to register our event. Also, we plan to return -ENOSYS for syscall events that don't match any syscalls listed in our arch tracing syscall table, we don't want to warn in that case, we just want this event to be invisible in debugfs and ignored. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e0cbede96783..f95f8470dd38 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -925,15 +925,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); - if (call->raw_init) { - ret = call->raw_init(); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } - } - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -1058,6 +1049,7 @@ static void trace_module_add_events(struct module *mod) struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; + int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1073,7 +1065,15 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; - + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "point events/%s\n", call->name); + continue; + } + } /* * This module has events, create file ops for this module * if not already done. @@ -1225,6 +1225,15 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "point events/%s\n", call->name); + continue; + } + } list_add(&call->list, &ftrace_events); event_create_dir(call, d_events, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, -- cgit From 69fd4f0eb2ececbf8ade55e31a933e174965745e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:44 -0400 Subject: tracing: Add ftrace_event_call void * 'data' field add an optional void * pointer to 'ftrace_event_call' that is passed in for regfunc and unregfunc. This prepares for syscall tracepoints creation by passing the name of the syscall we want to trace and then retrieve its number through our arch syscall table. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f95f8470dd38..1d289e2d6693 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -86,14 +86,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(); + call->unregfunc(call->data); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(); + call->regfunc(call->data); } break; } -- cgit From fb34a08c3469b2be9eae626ccb96476b4687b810 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:47 -0400 Subject: tracing: Add trace events for each syscall entry/exit Layer Frederic's syscall tracer on tracepoints. We create trace events via hooking into the SYSCALL_DEFINE macros. This allows us to individually toggle syscall entry and exit points on/off. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 183 +++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 08aed439feaf..c7ae25ee95d8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,15 +1,16 @@ #include #include +#include #include #include "trace_output.h" #include "trace.h" -/* Keep a counter of the syscall tracing users */ -static int refcount; - -/* Prevent from races on thread flags toggling */ static DEFINE_MUTEX(syscall_trace_lock); +static int sys_refcount_enter; +static int sys_refcount_exit; +static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); /* Option to display the parameters types */ enum { @@ -95,53 +96,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -void start_ftrace_syscalls(void) -{ - unsigned long flags; - struct task_struct *g, *t; - - mutex_lock(&syscall_trace_lock); - - /* Don't enable the flag on the tasks twice */ - if (++refcount != 1) - goto unlock; - - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); - - read_unlock_irqrestore(&tasklist_lock, flags); - -unlock: - mutex_unlock(&syscall_trace_lock); -} - -void stop_ftrace_syscalls(void) -{ - unsigned long flags; - struct task_struct *g, *t; - - mutex_lock(&syscall_trace_lock); - - /* There are perhaps still some users */ - if (--refcount) - goto unlock; - - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); - - read_unlock_irqrestore(&tasklist_lock, flags); - -unlock: - mutex_unlock(&syscall_trace_lock); -} - -void ftrace_syscall_enter(struct pt_regs *regs) +void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -150,6 +105,8 @@ void ftrace_syscall_enter(struct pt_regs *regs) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_enter_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) @@ -170,7 +127,7 @@ void ftrace_syscall_enter(struct pt_regs *regs) trace_wake_up(); } -void ftrace_syscall_exit(struct pt_regs *regs) +void ftrace_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -178,6 +135,8 @@ void ftrace_syscall_exit(struct pt_regs *regs) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_exit_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) @@ -196,54 +155,94 @@ void ftrace_syscall_exit(struct pt_regs *regs) trace_wake_up(); } -static int init_syscall_tracer(struct trace_array *tr) +int reg_event_syscall_enter(void *ptr) { - start_ftrace_syscalls(); - - return 0; + int ret = 0; + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_enter) + ret = register_trace_syscall_enter(ftrace_syscall_enter); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_enter_syscalls); + sys_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; } -static void reset_syscall_tracer(struct trace_array *tr) +void unreg_event_syscall_enter(void *ptr) { - stop_ftrace_syscalls(); - tracing_reset_online_cpus(tr); -} - -static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, - .trace = print_syscall_enter, -}; - -static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, - .trace = print_syscall_exit, -}; + int num; + char *name; -static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", - .init = init_syscall_tracer, - .reset = reset_syscall_tracer, - .flags = &syscalls_flags, -}; + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_enter--; + clear_bit(num, enabled_enter_syscalls); + if (!sys_refcount_enter) + unregister_trace_syscall_enter(ftrace_syscall_enter); + mutex_unlock(&syscall_trace_lock); +} -__init int register_ftrace_syscalls(void) +int reg_event_syscall_exit(void *ptr) { - int ret; - - ret = register_ftrace_event(&syscall_enter_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_enter_event.type); - WARN_ON_ONCE(1); + int ret = 0; + int num; + char *name; + + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_exit) + ret = register_trace_syscall_exit(ftrace_syscall_exit); + if (ret) { + pr_info("event trace: Could not activate" + "syscall exit trace point"); + } else { + set_bit(num, enabled_exit_syscalls); + sys_refcount_exit++; } + mutex_unlock(&syscall_trace_lock); + return ret; +} - ret = register_ftrace_event(&syscall_exit_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_exit_event.type); - WARN_ON_ONCE(1); - } +void unreg_event_syscall_exit(void *ptr) +{ + int num; + char *name; - return register_tracer(&syscall_tracer); + name = (char *)ptr; + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_exit--; + clear_bit(num, enabled_exit_syscalls); + if (!sys_refcount_exit) + unregister_trace_syscall_exit(ftrace_syscall_exit); + mutex_unlock(&syscall_trace_lock); } -device_initcall(register_ftrace_syscalls); + +struct trace_event event_syscall_enter = { + .trace = print_syscall_enter, + .type = TRACE_SYSCALL_ENTER +}; + +struct trace_event event_syscall_exit = { + .trace = print_syscall_exit, + .type = TRACE_SYSCALL_EXIT +}; -- cgit From 64c12e0444fcc6b75eb49144ba46d43dbdc6bc8f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:53 -0400 Subject: tracing: Add individual syscalls tracepoint id support The current state of syscalls tracepoints generates only one event id for every syscall events. This patch associates an id with each syscall trace event, so that we can identify each syscall trace event using the 'perf' tool. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.h | 6 ------ kernel/trace/trace_syscalls.c | 26 ++++++++++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d682357e4b1f..300ef788c976 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,8 +34,6 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, - TRACE_SYSCALL_ENTER, - TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, @@ -319,10 +317,6 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct syscall_trace_enter, \ - TRACE_SYSCALL_ENTER); \ - IF_ASSIGN(var, ent, struct syscall_trace_exit, \ - TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c7ae25ee95d8..e58a9c11ba85 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -36,14 +36,18 @@ print_syscall_enter(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int i, ret, syscall; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) goto end; + if (entry->enter_id != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + ret = trace_seq_printf(s, "%s(", entry->name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -78,16 +82,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int ret; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) { trace_seq_printf(s, "\n"); return TRACE_TYPE_HANDLED; } + if (entry->exit_id != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); if (!ret) @@ -114,7 +122,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + event = trace_current_buffer_lock_reserve(sys_data->enter_id, size, 0, 0); if (!event) return; @@ -142,7 +150,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + event = trace_current_buffer_lock_reserve(sys_data->exit_id, sizeof(*entry), 0, 0); if (!event) return; @@ -239,10 +247,8 @@ void unreg_event_syscall_exit(void *ptr) struct trace_event event_syscall_enter = { .trace = print_syscall_enter, - .type = TRACE_SYSCALL_ENTER }; struct trace_event event_syscall_exit = { .trace = print_syscall_exit, - .type = TRACE_SYSCALL_EXIT }; -- cgit From f4b5ffccc83c82947f5d9f15d6f1b6edb1b71cd7 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:53:02 -0400 Subject: tracing: Add perf counter support for syscalls tracing The perf counter support is automated for usual trace events. But we have to define specific callbacks for this to handle syscalls trace events Make 'perf stat -e syscalls:sys_enter_blah' work with syscall style tracepoints. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 121 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e58a9c11ba85..f4eaec3d559a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "trace_output.h" @@ -252,3 +253,123 @@ struct trace_event event_syscall_enter = { struct trace_event event_syscall_exit = { .trace = print_syscall_exit, }; + +#ifdef CONFIG_EVENT_PROFILE +static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); +static int sys_prof_refcount_enter; +static int sys_prof_refcount_exit; + +static void prof_syscall_enter(struct pt_regs *regs, long id) +{ + struct syscall_metadata *sys_data; + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + perf_tpcounter_event(sys_data->enter_id, 0, 1, NULL, 0); +} + +int reg_prof_syscall_enter(char *name) +{ + int ret = 0; + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + + mutex_lock(&syscall_trace_lock); + if (!sys_prof_refcount_enter) + ret = register_trace_syscall_enter(prof_syscall_enter); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_prof_enter_syscalls); + sys_prof_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_prof_syscall_enter(char *name) +{ + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + + mutex_lock(&syscall_trace_lock); + sys_prof_refcount_enter--; + clear_bit(num, enabled_prof_enter_syscalls); + if (!sys_prof_refcount_enter) + unregister_trace_syscall_enter(prof_syscall_enter); + mutex_unlock(&syscall_trace_lock); +} + +static void prof_syscall_exit(struct pt_regs *regs, long ret) +{ + struct syscall_metadata *sys_data; + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + perf_tpcounter_event(sys_data->exit_id, 0, 1, NULL, 0); +} + +int reg_prof_syscall_exit(char *name) +{ + int ret = 0; + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return -ENOSYS; + + mutex_lock(&syscall_trace_lock); + if (!sys_prof_refcount_exit) + ret = register_trace_syscall_exit(prof_syscall_exit); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_prof_exit_syscalls); + sys_prof_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_prof_syscall_exit(char *name) +{ + int num; + + num = syscall_name_to_nr(name); + if (num < 0 || num >= FTRACE_SYSCALL_MAX) + return; + + mutex_lock(&syscall_trace_lock); + sys_prof_refcount_exit--; + clear_bit(num, enabled_prof_exit_syscalls); + if (!sys_prof_refcount_exit) + unregister_trace_syscall_exit(prof_syscall_exit); + mutex_unlock(&syscall_trace_lock); +} + +#endif + + -- cgit From e8f9f4d79a677f55c8ec3acbe87b33a87e2df0de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2009 17:42:52 +0200 Subject: tracing: Add ftrace event call parameter to its field descriptor handler Add the struct ftrace_event_call as a parameter of its show_format() callback. This way we can use it from the syscall trace events to retrieve the syscall name from the ftrace event call parameter and describe its fields using the syscalls metadata. Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron --- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_export.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d289e2d6693..b568ade8f453 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -576,7 +576,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_printf(s, "format:\n"); trace_write_header(s); - r = call->show_format(s); + r = call->show_format(call, s); if (!r) { /* * ug! The format output is bigger than a PAGE!! diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d06cf898dc86..956d4bc675e5 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -60,7 +60,8 @@ extern void __bad_type_size(void); #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ struct args field; \ int ret; \ @@ -76,7 +77,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ tpfmt) \ static int \ -ftrace_format_##call(struct trace_seq *s) \ +ftrace_format_##call(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ struct args field; \ int ret; \ -- cgit From dc4ddb4c0b7348f1c9759ae8a9e7d734dc1cda82 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2009 19:03:54 +0200 Subject: tracing: Add fields format definition for syscall events Define the format of the syscall trace fields to parse the binary values from a raw trace using the syscall events "format" file. This is defined dynamically using the syscalls metadata. It prepares the export of syscall event raw records to perf counters. Example: $ cat /debug/tracing/events/syscalls/sys_enter_sched_getparam/format name: sys_enter_sched_getparam ID: 39 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field:pid_t pid; offset:12; size:8; field:struct sched_param * param; offset:20; size:8; print fmt: "pid: 0x%08lx, param: 0x%08lx", ((unsigned long)(REC->pid)), ((unsigned long)(REC->param)) Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron --- kernel/trace/trace_syscalls.c | 46 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f4eaec3d559a..9ee6386cf842 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -105,6 +105,52 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } +int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) +{ + int i; + int nr; + int ret = 0; + struct syscall_metadata *entry; + int offset = sizeof(struct trace_entry); + + nr = syscall_name_to_nr((char *)call->data); + entry = syscall_nr_to_meta(nr); + + if (!entry) + return ret; + + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], + entry->args[i]); + if (!ret) + return 0; + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%lu;\n", offset, + sizeof(unsigned long)); + if (!ret) + return 0; + offset += sizeof(unsigned long); + } + + trace_seq_printf(s, "\nprint fmt: \""); + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "%s: 0x%%0%lulx%s", entry->args[i], + sizeof(unsigned long), + i == entry->nb_args - 1 ? "\", " : ", "); + if (!ret) + return 0; + } + + for (i = 0; i < entry->nb_args; i++) { + ret = trace_seq_printf(s, "((unsigned long)(REC->%s))%s", + entry->args[i], + i == entry->nb_args - 1 ? "\n" : ", "); + if (!ret) + return 0; + } + + return ret; +} + void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; -- cgit From 19007a67a64f9b3cbbd7024f972654ebf14daade Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2009 20:22:53 +0200 Subject: tracing: Support for syscall events raw records in perfcounters This bring the support for raw syscall events in perfcounters. The arguments or exit value are saved as a raw sample using the PERF_SAMPLE_RAW attribute in a perf counter. Example (for now you must explicitly set the PERF_SAMPLE_RAW flag in perf record): perf record -e syscalls:sys_enter_open -f -F 1 -a perf report -D 0x2cbb8 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 02 00 50 00 20 e9 39 ab 0a 7f 00 00 ......P. .9.... . 0010: bc 14 00 00 bc 14 00 00 01 00 00 00 00 00 00 00 ............... . 0020: 2c 00 00 00 15 01 01 00 bc 14 00 00 bc 14 00 00 ,.............. ^ ^ ^ ^ ^ ^ ^ .......................... Event Size struct trace_entry . 0030: 00 00 00 00 46 98 43 02 00 00 00 00 80 08 00 00 ....F.C........ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ptr to file name open flags . 0040: 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 ............... ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ . open mode padding 0x2cbb8 [0x50]: PERF_EVENT_SAMPLE (IP, 2): 5308: 0x7f0aab39e920 period: 1 Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Jason Baron Cc: Masami Hiramatsu --- kernel/trace/trace_syscalls.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9ee6386cf842..f837cccabcf7 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -301,6 +301,17 @@ struct trace_event event_syscall_exit = { }; #ifdef CONFIG_EVENT_PROFILE + +struct syscall_enter_record { + struct trace_entry entry; + unsigned long args[0]; +}; + +struct syscall_exit_record { + struct trace_entry entry; + unsigned long ret; +}; + static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); static int sys_prof_refcount_enter; @@ -308,8 +319,10 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { + struct syscall_enter_record *rec; struct syscall_metadata *sys_data; int syscall_nr; + int size; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -319,7 +332,24 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) if (!sys_data) return; - perf_tpcounter_event(sys_data->enter_id, 0, 1, NULL, 0); + /* get the size after alignment with the u32 buffer size field */ + size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size = ALIGN(size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + do { + char raw_data[size]; + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + rec = (struct syscall_enter_record *) raw_data; + tracing_generic_entry_update(&rec->entry, 0, 0); + rec->entry.type = sys_data->enter_id; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + } while(0); } int reg_prof_syscall_enter(char *name) @@ -364,6 +394,7 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; + struct syscall_exit_record rec; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); @@ -374,7 +405,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - perf_tpcounter_event(sys_data->exit_id, 0, 1, NULL, 0); + tracing_generic_entry_update(&rec.entry, 0, 0); + rec.entry.type = sys_data->exit_id; + rec.ret = syscall_get_return_value(current, regs); + + perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); } int reg_prof_syscall_exit(char *name) -- cgit From 60d970c254b95ec7a0fc4c590b510253987b64a0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 23:37:26 +0200 Subject: tracing: Fix syscall tracing on !HAVE_FTRACE_SYSCALLS architectures The new syscall_regfunc()/unregfunc() functions rely on the existence of TIF_SYSCALL_FTRACE - but that TIF flag is only offered by HAVE_FTRACE_SYSCALLS. Cc: Frederic Weisbecker Cc: Jason Baron Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 070a42bb8920..35dd27adb82c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -579,6 +579,8 @@ __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ +#ifdef CONFIG_FTRACE_SYSCALLS + static DEFINE_MUTEX(regfunc_mutex); static int sys_tracepoint_refcount; @@ -615,3 +617,4 @@ void syscall_unregfunc(void) } mutex_unlock(®func_mutex); } +#endif -- cgit From 7ead8b8313d92b3a69a1a61b0dcbc4cd66c960dc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:56:28 +0800 Subject: tracing/events: Add module tracepoints Add trace points to trace module_load, module_free, module_get, module_put and module_request, and use trace_event facility to get the trace output. Here's the sample output: TASK-PID CPU# TIMESTAMP FUNCTION | | | | | <...>-42 [000] 1.758380: module_request: fb0 wait=1 call_site=fb_open ... <...>-60 [000] 3.269403: module_load: scsi_wait_scan <...>-60 [000] 3.269432: module_put: scsi_wait_scan call_site=sys_init_module refcnt=0 <...>-61 [001] 3.273168: module_free: scsi_wait_scan ... <...>-1021 [000] 13.836081: module_load: sunrpc <...>-1021 [000] 13.840589: module_put: sunrpc call_site=sys_init_module refcnt=-1 <...>-1027 [000] 13.848098: module_get: sunrpc call_site=try_module_get refcnt=0 <...>-1027 [000] 13.848308: module_get: sunrpc call_site=get_filesystem refcnt=1 <...>-1027 [000] 13.848692: module_put: sunrpc call_site=put_filesystem refcnt=0 ... modprobe-2587 [001] 1088.437213: module_load: trace_events_sample F modprobe-2587 [001] 1088.437786: module_put: trace_events_sample call_site=sys_init_module refcnt=0 Note: - the taints flag can be 'F', 'C' and/or 'P' if mod->taints != 0 - the module refcnt is percpu, so it can be negative in a specific cpu Signed-off-by: Li Zefan Acked-by: Rusty Russell Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Rusty Russell LKML-Reference: <4A891B3C.5030608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/kmod.c | 4 ++++ kernel/module.c | 11 +++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 385c31a1bdbf..a92280870e30 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,8 @@ #include #include +#include + extern int max_threads; static struct workqueue_struct *khelper_wq; @@ -108,6 +110,8 @@ int __request_module(bool wait, const char *fmt, ...) return -ENOMEM; } + trace_module_request(module_name, wait, _RET_IP_); + ret = call_usermodehelper(modprobe_path, argv, envp, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); diff --git a/kernel/module.c b/kernel/module.c index fd1411403558..b1821438694e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -55,6 +55,11 @@ #include #include +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL(module_get); + #if 0 #define DEBUGP printk #else @@ -940,6 +945,8 @@ void module_put(struct module *module) if (module) { unsigned int cpu = get_cpu(); local_dec(__module_ref_addr(module, cpu)); + trace_module_put(module, _RET_IP_, + local_read(__module_ref_addr(module, cpu))); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); @@ -1491,6 +1498,8 @@ static int __unlink_module(void *_mod) /* Free a module, remove from lists, etc (must hold module_mutex). */ static void free_module(struct module *mod) { + trace_module_free(mod); + /* Delete from various lists */ stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); @@ -2358,6 +2367,8 @@ static noinline struct module *load_module(void __user *umod, /* Get rid of temporary copy */ vfree(hdr); + trace_module_load(mod); + /* Done! */ return mod; -- cgit From ba8b3a40ba7e06d00c27508f090803af90e8dbbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:55:18 +0800 Subject: tracing/syscalls: Fix to print parameter types When syscall tracing was implemented as a tracer, "syscall_arg_type" trace option could be set to enable the display of syscall parameter types. Now this option is gone since it's no longer a tracer, but the code is still there but dead. So we remove dead code and re-enable the printing of paramete types via the verbose option: # echo verbose > trace_options # echo syscalls > set_event # cat trace ... bash-3331 [000] 95.348937: sys_fcntl64 -> 0x1 bash-3331 [000] 95.348942: sys_close(unsigned int fd: a) ... Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron LKML-Reference: <4A891AF6.5050102@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f837cccabcf7..f130dacfeef4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -13,21 +13,6 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); -/* Option to display the parameters types */ -enum { - TRACE_SYSCALLS_OPT_TYPES = 0x1, -}; - -static struct tracer_opt syscalls_opts[] = { - { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, - { } -}; - -static struct tracer_flags syscalls_flags = { - .val = 0, /* By default: no parameters types */ - .opts = syscalls_opts -}; - enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -55,7 +40,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) for (i = 0; i < entry->nb_args; i++) { /* parameter types */ - if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + if (trace_flags & TRACE_ITER_VERBOSE) { ret = trace_seq_printf(s, "%s ", entry->types[i]); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit From 97d53202a5670a08b79c8ef2e4fff1c1ee21317c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:52:53 +0800 Subject: trace_stat: Fix missing entry in stat file One entry is missing in the output of a stat file. The cause is, when stat_seq_start() is called the 2nd time, we should start from the (pos-1)th elem in the rbtree but not pos, because pos == 0 is the header. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A891A65.70009@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 07c60b09258f..a4bb239eb987 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -203,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; struct rb_node *node; + int n = *pos; int i; /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) - return SEQ_START_TOKEN; + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; + } node = rb_first(&session->stat_root); - for (i = 0; node && i < *pos; i++) + for (i = 0; node && i < n; i++) node = rb_next(node); return node; -- cgit From 2fc5f0cff4cf1c4cd336d0f61f11bca6eeee1d84 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:53:37 +0800 Subject: trace_stack: Simplify seqfile code Extract duplicate code in t_start() and t_next(). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A891A91.4030602@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0da1cff08d67..0f6facb050a1 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = { }; static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +__next(struct seq_file *m, loff_t *pos) { - long i; + long n = *pos - 1; - (*pos)++; - - if (v == SEQ_START_TOKEN) - i = 0; - else { - i = *(long *)v; - i++; - } - - if (i >= max_stack_trace.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; - m->private = (void *)i; - + m->private = (void *)n; return &m->private; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) { - void *t = SEQ_START_TOKEN; - loff_t l = 0; + (*pos)++; + return __next(m, pos); +} +static void *t_start(struct seq_file *m, loff_t *pos) +{ local_irq_disable(); __raw_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; + return __next(m, pos); } static void t_stop(struct seq_file *m, void *p) -- cgit From 3be04b471b95b870bd129a138463756629e86f3f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Aug 2009 16:54:03 +0800 Subject: ftrace: Simplify seqfile code Use seq_release_private(). Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <4A891AAB.8090701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 094863416b2e..1993b7186cdb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1556,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -int ftrace_avail_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter = m->private; - - seq_release(inode, file); - kfree(iter); - - return 0; -} - static int ftrace_failures_open(struct inode *inode, struct file *file) { @@ -2427,14 +2416,14 @@ static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_failures_fops = { .open = ftrace_failures_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_filter_fops = { -- cgit From e6971969c331caa5c3c88cbd1be4f465b3355452 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:52:25 +0800 Subject: tracing/syscalls: Fix fields format for enter events The "format" file of a trace event is originally for parsers to parse ftrace binary output. But the "format" file of a syscall event can only be used by perfcounter, because it describes the format of struct syscall_enter_record not struct syscall_trace_enter. To fix this, we remove struct syscall_enter_record, and then struct syscall_trace_enter will be used by both perf profile and ftrace. Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAF39.1030404@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 51 ++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f130dacfeef4..d10daf0922b5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -90,26 +90,39 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } +extern char *__bad_type_size(void); + +#define SYSCALL_FIELD(type, name) \ + sizeof(type) != sizeof(trace.name) ? \ + __bad_type_size() : \ + #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) { int i; int nr; - int ret = 0; + int ret; struct syscall_metadata *entry; - int offset = sizeof(struct trace_entry); + struct syscall_trace_enter trace; + int offset = offsetof(struct syscall_trace_enter, args); - nr = syscall_name_to_nr((char *)call->data); + nr = syscall_name_to_nr(call->data); entry = syscall_nr_to_meta(nr); if (!entry) - return ret; + return 0; + + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + SYSCALL_FIELD(int, nr)); + if (!ret) + return 0; for (i = 0; i < entry->nb_args; i++) { ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%lu;\n", offset, + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, sizeof(unsigned long)); if (!ret) return 0; @@ -118,7 +131,7 @@ int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) trace_seq_printf(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%lulx%s", entry->args[i], + ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], sizeof(unsigned long), i == entry->nb_args - 1 ? "\", " : ", "); if (!ret) @@ -287,16 +300,6 @@ struct trace_event event_syscall_exit = { #ifdef CONFIG_EVENT_PROFILE -struct syscall_enter_record { - struct trace_entry entry; - unsigned long args[0]; -}; - -struct syscall_exit_record { - struct trace_entry entry; - unsigned long ret; -}; - static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); static int sys_prof_refcount_enter; @@ -304,7 +307,7 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { - struct syscall_enter_record *rec; + struct syscall_trace_enter *rec; struct syscall_metadata *sys_data; int syscall_nr; int size; @@ -328,9 +331,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - rec = (struct syscall_enter_record *) raw_data; - tracing_generic_entry_update(&rec->entry, 0, 0); - rec->entry.type = sys_data->enter_id; + rec = (struct syscall_trace_enter *) raw_data; + tracing_generic_entry_update(&rec->ent, 0, 0); + rec->ent.type = sys_data->enter_id; + rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); @@ -379,7 +383,7 @@ void unreg_prof_syscall_enter(char *name) static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; - struct syscall_exit_record rec; + struct syscall_trace_exit rec; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); @@ -390,8 +394,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - tracing_generic_entry_update(&rec.entry, 0, 0); - rec.entry.type = sys_data->exit_id; + tracing_generic_entry_update(&rec.ent, 0, 0); + rec.ent.type = sys_data->exit_id; + rec.nr = syscall_nr; rec.ret = syscall_get_return_value(current, regs); perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); -- cgit From 10a5b66f625904ad5a2867cf7a28073e1236ff32 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:53:05 +0800 Subject: tracing/syscalls: Add fields format for exit events Add "format" file for syscall exit events: # cat events/syscalls/sys_exit_open/format name: sys_exit_open ID: 344 format: field:unsigned short common_type; offset:0; size:2; field:unsigned char common_flags; offset:2; size:1; field:unsigned char common_preempt_count; offset:3; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field:int nr; offset:12; size:4; field:unsigned long ret; offset:16; size:4; Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAF61.3060307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d10daf0922b5..7336b6c265d7 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -97,7 +97,7 @@ extern char *__bad_type_size(void); __bad_type_size() : \ #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) -int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) +int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { int i; int nr; @@ -149,6 +149,22 @@ int ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s) return ret; } +int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) +{ + int ret; + struct syscall_trace_exit trace; + + ret = trace_seq_printf(s, + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + SYSCALL_FIELD(int, nr), + SYSCALL_FIELD(unsigned long, ret)); + if (!ret) + return 0; + + return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); +} + void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; -- cgit From 14be96c9716cb8c46dca94bd890defd7856e0734 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:53:52 +0800 Subject: tracing/events: Add ftrace_event_call param to define_fields() This parameter is needed by syscall events to add define_fields() handler. Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAF90.6060801@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_export.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b568ade8f453..af8fb8ebef0b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -941,7 +941,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(); + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 956d4bc675e5..cf2c752a25bf 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -119,7 +119,7 @@ ftrace_format_##call(struct ftrace_event_call *unused, \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(void); \ +int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \ static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ @@ -184,9 +184,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ int \ -ftrace_define_fields_##call(void) \ +ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ - struct ftrace_event_call *event_call = &event_##call; \ struct args field; \ int ret; \ \ -- cgit From e647d6b314266adb904d4b84973eda0afa856946 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:54:32 +0800 Subject: tracing/events: Add trace_define_common_fields() Extract duplicate code. Also prepare for the later patch. Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAFB8.1010304@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 22 ++++++++++++++++++++++ kernel/trace/trace_export.c | 8 +++----- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index af8fb8ebef0b..9c7ecfb3416f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -62,6 +62,28 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#define __common_field(type, item) \ + ret = trace_define_field(call, #type, "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type)); \ + if (ret) \ + return ret; + +int trace_define_common_fields(struct ftrace_event_call *call) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + __common_field(int, tgid); + + return ret; +} + #ifdef CONFIG_MODULES static void trace_destroy_fields(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index cf2c752a25bf..70875303ae46 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -189,11 +189,9 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type, 0); \ - __common_field(unsigned char, flags, 0); \ - __common_field(unsigned char, preempt_count, 0); \ - __common_field(int, pid, 1); \ - __common_field(int, tgid, 1); \ + ret = trace_define_common_fields(event_call); \ + if (ret) \ + return ret; \ \ tstruct; \ \ -- cgit From 540b7b8d65575c80162f2a0f38e1d313c92a6042 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Aug 2009 15:54:51 +0800 Subject: tracing/syscalls: Add filtering support Add filtering support for syscall events: # echo 'mode == 0666' > events/syscalls/sys_enter_open # echo 'ret == 0' > events/syscalls/sys_exit_open # echo 1 > events/syscalls/sys_enter_open # echo 1 > events/syscalls/sys_exit_open # cat trace ... modprobe-3084 [001] 117.463140: sys_open(filename: 917d3e8, flags: 0, mode: 1b6) modprobe-3084 [001] 117.463176: sys_open -> 0x0 less-3086 [001] 117.510455: sys_open(filename: 9c6bdb8, flags: 8000, mode: 1b6) sendmail-2574 [001] 122.145840: sys_open(filename: b807a365, flags: 0, mode: 1b6) ... Signed-off-by: Li Zefan Cc: Jason Baron Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A8BAFCB.1040006@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 5 +++-- kernel/trace/trace_syscalls.c | 51 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9c7ecfb3416f..79d352027a61 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,8 +27,8 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed) +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -83,6 +83,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) return ret; } +EXPORT_SYMBOL_GPL(trace_define_common_fields); #ifdef CONFIG_MODULES diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7336b6c265d7..28e4dae4af21 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -165,6 +165,49 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); } +int syscall_enter_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_enter trace; + struct syscall_metadata *meta; + int ret; + int nr; + int i; + int offset = offsetof(typeof(trace), args); + + nr = syscall_name_to_nr(call->data); + meta = syscall_nr_to_meta(nr); + + if (!meta) + return 0; + + ret = trace_define_common_fields(call); + if (ret) + return ret; + + for (i = 0; i < meta->nb_args; i++) { + ret = trace_define_field(call, meta->types[i], + meta->args[i], offset, + sizeof(unsigned long), 0); + offset += sizeof(unsigned long); + } + + return ret; +} + +int syscall_exit_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_exit trace; + int ret; + + ret = trace_define_common_fields(call); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0); + + return ret; +} + void ftrace_syscall_enter(struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; @@ -192,8 +235,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(sys_data->enter_event, entry, event)) + trace_current_buffer_unlock_commit(event, 0, 0); } void ftrace_syscall_exit(struct pt_regs *regs, long ret) @@ -220,8 +263,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(sys_data->exit_event, entry, event)) + trace_current_buffer_unlock_commit(event, 0, 0); } int reg_event_syscall_enter(void *ptr) -- cgit From 4539f07701b3f743580d19dc5d655fb8d21b0a3c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 20 Aug 2009 16:13:35 +0800 Subject: tracing/syscalls: Fix the output of syscalls with no arguments Before: # echo 1 > events/syscalls/sys_enter_sync/enable # cat events/syscalls/sys_enter_sync/format ... field:int nr; offset:12; size:4; print fmt: "# sync # cat trace ... sync-8950 [000] 2366.087670: sys_sync( After: # echo 1 > events/syscalls/sys_enter_sync/enable # cat events/syscalls/sys_enter_sync/format ... field:int nr; offset:12; size:4; print fmt: "" # sync # cat trace sync-2134 [001] 136.780735: sys_sync() Reported-by: Masami Hiramatsu Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Jason Baron Cc: Masami Hiramatsu LKML-Reference: <4A8D05AF.20103@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 28e4dae4af21..46c1b977a2cb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -46,15 +46,22 @@ print_syscall_enter(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], trace->args[i], - i == entry->nb_args - 1 ? ")" : ","); + i == entry->nb_args - 1 ? "" : ", "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + ret = trace_seq_putc(s, ')'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + end: - trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } @@ -129,24 +136,24 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) offset += sizeof(unsigned long); } - trace_seq_printf(s, "\nprint fmt: \""); + trace_seq_puts(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], sizeof(unsigned long), - i == entry->nb_args - 1 ? "\", " : ", "); + i == entry->nb_args - 1 ? "" : ", "); if (!ret) return 0; } + trace_seq_putc(s, '"'); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "((unsigned long)(REC->%s))%s", - entry->args[i], - i == entry->nb_args - 1 ? "\n" : ", "); + ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", + entry->args[i]); if (!ret) return 0; } - return ret; + return trace_seq_putc(s, '\n'); } int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) -- cgit From d88cb582325830698de5071fa8b8c9e933dbbcad Mon Sep 17 00:00:00 2001 From: Anirban Sinha Date: Tue, 25 Aug 2009 07:00:02 -0700 Subject: tracing: Eliminate code duplication in kernel/tracepoint.c Signed-off-by: Anirban Sinha Reviewed-by: Li Zefan Cc: "Oleg Nesterov" LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 35dd27adb82c..06f165a44083 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -555,9 +555,6 @@ int tracepoint_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); - break; case MODULE_STATE_GOING: tracepoint_update_probe_range(mod->tracepoints, mod->tracepoints + mod->num_tracepoints); -- cgit From 667000011927b4fcc359beac4a2447889db6d349 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:11 -0700 Subject: tracing: Rename FTRACE_SYSCALLS for tracepoints s/HAVE_FTRACE_SYSCALLS/HAVE_SYSCALL_TRACEPOINTS/g s/TIF_SYSCALL_FTRACE/TIF_SYSCALL_TRACEPOINT/g The syscall enter/exit tracing is no longer specific to just ftrace, so they now have names that reflect their tie to tracepoints instead. Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-2-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 4 ++-- kernel/tracepoint.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380fd764..06be85a7ef8c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -41,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool -config HAVE_FTRACE_SYSCALLS +config HAVE_SYSCALL_TRACEPOINTS bool config TRACER_MAX_TRACE @@ -211,7 +211,7 @@ config ENABLE_DEFAULT_TRACERS config FTRACE_SYSCALLS bool "Trace syscalls" - depends on HAVE_FTRACE_SYSCALLS + depends on HAVE_SYSCALL_TRACEPOINTS select GENERIC_TRACER select KALLSYMS help diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 06f165a44083..be86b9a01a09 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -590,7 +590,7 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } @@ -608,7 +608,7 @@ void syscall_unregfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit From 3d27d8cb34fc156beb86de2338ca4029873a5cc6 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:12 -0700 Subject: tracing: Make syscall tracepoints conditional The syscall enter/exit tracepoints are only supported on archs that HAVE_SYSCALL_TRACEPOINTS, so the declarations should be #ifdef'ed. Also, the definition of syscall_regfunc and syscall_unregfunc should depend on this same config, rather than the ftrace-specific one. Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan LKML-Reference: <1251150194-1713-3-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/tracepoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index be86b9a01a09..9e0a36f0e2a9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -576,7 +576,7 @@ __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -#ifdef CONFIG_FTRACE_SYSCALLS +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS static DEFINE_MUTEX(regfunc_mutex); static int sys_tracepoint_refcount; -- cgit From 97419875865859fd2403e66266c02ce028e2f5ab Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:13 -0700 Subject: tracing: Move tracepoint callbacks from declaration to definition It's not strictly correct for the tracepoint reg/unreg callbacks to occur when a client is hooking up, because the actual tracepoint may not be present yet. This happens to be fine for syscall, since that's in the core kernel, but it would cause problems for tracepoints defined in a module that hasn't been loaded yet. It also means the reg/unreg has to be EXPORTed for any modules to use the tracepoint (as in SystemTap). This patch removes DECLARE_TRACE_WITH_CALLBACK, and instead introduces DEFINE_TRACE_FN which stores the callbacks in struct tracepoint. The callbacks are used now when the active state of the tracepoint changes in set_tracepoint & disable_tracepoint. This also introduces TRACE_EVENT_FN, so ftrace events can also provide registration callbacks if needed. Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-4-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/tracepoint.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9e0a36f0e2a9..1a6a453b7efb 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -243,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); + if (elem->regfunc && !elem->state && active) + elem->regfunc(); + else if (elem->unregfunc && elem->state && !active) + elem->unregfunc(); + /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new * probe callbacks array is consistent before setting a pointer to it. @@ -262,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { + if (elem->unregfunc && elem->state) + elem->unregfunc(); + elem->state = 0; rcu_assign_pointer(elem->funcs, NULL); } @@ -578,7 +586,7 @@ __initcall(init_tracepoints); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -static DEFINE_MUTEX(regfunc_mutex); +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; void syscall_regfunc(void) @@ -586,7 +594,6 @@ void syscall_regfunc(void) unsigned long flags; struct task_struct *g, *t; - mutex_lock(®func_mutex); if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -595,7 +602,6 @@ void syscall_regfunc(void) read_unlock_irqrestore(&tasklist_lock, flags); } sys_tracepoint_refcount++; - mutex_unlock(®func_mutex); } void syscall_unregfunc(void) @@ -603,7 +609,6 @@ void syscall_unregfunc(void) unsigned long flags; struct task_struct *g, *t; - mutex_lock(®func_mutex); sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); @@ -612,6 +617,5 @@ void syscall_unregfunc(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } - mutex_unlock(®func_mutex); } #endif -- cgit From 1c569f0264ea629c10bbab471dd0626ce4d3f19f Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:14 -0700 Subject: tracing: Create generic syscall TRACE_EVENTs This converts the syscall_enter/exit tracepoints into TRACE_EVENTs, so you can have generic ftrace events that capture all system calls with arguments and return values. These generic events are also renamed to sys_enter/exit, so they're more closely aligned to the specific sys_enter_foo events. Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-5-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46c1b977a2cb..2698fe401ebd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -286,7 +287,7 @@ int reg_event_syscall_enter(void *ptr) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) - ret = register_trace_syscall_enter(ftrace_syscall_enter); + ret = register_trace_sys_enter(ftrace_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -311,7 +312,7 @@ void unreg_event_syscall_enter(void *ptr) sys_refcount_enter--; clear_bit(num, enabled_enter_syscalls); if (!sys_refcount_enter) - unregister_trace_syscall_enter(ftrace_syscall_enter); + unregister_trace_sys_enter(ftrace_syscall_enter); mutex_unlock(&syscall_trace_lock); } @@ -327,7 +328,7 @@ int reg_event_syscall_exit(void *ptr) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) - ret = register_trace_syscall_exit(ftrace_syscall_exit); + ret = register_trace_sys_exit(ftrace_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); @@ -352,7 +353,7 @@ void unreg_event_syscall_exit(void *ptr) sys_refcount_exit--; clear_bit(num, enabled_exit_syscalls); if (!sys_refcount_exit) - unregister_trace_syscall_exit(ftrace_syscall_exit); + unregister_trace_sys_exit(ftrace_syscall_exit); mutex_unlock(&syscall_trace_lock); } @@ -418,7 +419,7 @@ int reg_prof_syscall_enter(char *name) mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) - ret = register_trace_syscall_enter(prof_syscall_enter); + ret = register_trace_sys_enter(prof_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -442,7 +443,7 @@ void unreg_prof_syscall_enter(char *name) sys_prof_refcount_enter--; clear_bit(num, enabled_prof_enter_syscalls); if (!sys_prof_refcount_enter) - unregister_trace_syscall_enter(prof_syscall_enter); + unregister_trace_sys_enter(prof_syscall_enter); mutex_unlock(&syscall_trace_lock); } @@ -479,7 +480,7 @@ int reg_prof_syscall_exit(char *name) mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) - ret = register_trace_syscall_exit(prof_syscall_exit); + ret = register_trace_sys_exit(prof_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -503,7 +504,7 @@ void unreg_prof_syscall_exit(char *name) sys_prof_refcount_exit--; clear_bit(num, enabled_prof_exit_syscalls); if (!sys_prof_refcount_exit) - unregister_trace_syscall_exit(prof_syscall_exit); + unregister_trace_sys_exit(prof_syscall_exit); mutex_unlock(&syscall_trace_lock); } -- cgit From aa38e9fc3ea804290efd3a39316d7f7e6c945800 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 10:33:02 +0800 Subject: tracing/filters: Add filter_type to struct ftrace_event_field The type of a field is stored as a string in @type, and here we add @filter_type which is an enum value. This prepares for later patches, so we can specifically assign different @filter_type for the same @type. For example normally a "char *" field is treated as a ptr, but we may want it to be treated as a string when doing filting. Signed-off-by: Li Zefan LKML-Reference: <4A7B925E.9030605@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_events_filter.c | 23 ++++++++++++++--------- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 300ef788c976..64dda5709cb9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -755,6 +755,7 @@ struct ftrace_event_field { struct list_head link; char *name; char *type; + int filter_type; int offset; int size; int is_signed; @@ -800,6 +801,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); +extern int filter_assign_type(const char *type); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 79d352027a61..5740e90f4ca1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -44,9 +44,11 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, if (!field->type) goto err; + field->filter_type = filter_assign_type(type); field->offset = offset; field->size = size; field->is_signed = is_signed; + list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 490337abed75..22e6d822bbaa 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -476,11 +476,12 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } enum { - FILTER_STATIC_STRING = 1, - FILTER_DYN_STRING + FILTER_OTHER = 0, + FILTER_STATIC_STRING, + FILTER_DYN_STRING, }; -static int is_string_field(const char *type) +int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; @@ -488,12 +489,18 @@ static int is_string_field(const char *type) if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; - return 0; + return FILTER_OTHER; +} + +static bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING; } static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) return 0; return 1; @@ -550,7 +557,6 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; - int string_type; int ret; pred->fn = filter_pred_none; @@ -578,9 +584,8 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - string_type = is_string_field(field->type); - if (string_type) { - if (string_type == FILTER_STATIC_STRING) + if (is_string_field(field)) { + if (field->filter_type == FILTER_STATIC_STRING) fn = filter_pred_string; else fn = filter_pred_strloc; -- cgit From 43b51ead3f752a3935116e5b1a94254b8573734f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 10:33:22 +0800 Subject: tracing/filters: Add __field_ext() to TRACE_EVENT Add __field_ext(), so a field can be assigned to a specific filter_type, which matches a corresponding filter function. For example, a later patch will allow this: __field_ext(const char *, str, FILTER_PTR_STR); Signed-off-by: Li Zefan LKML-Reference: <4A7B9272.6050709@cn.fujitsu.com> [ Fixed a -1 to FILTER_OTHER Forward ported to latest kernel. ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 11 ++++++++--- kernel/trace/trace_events_filter.c | 6 ------ kernel/trace/trace_export.c | 8 +++++--- kernel/trace/trace_syscalls.c | 6 ++++-- 4 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5740e90f4ca1..d33bcdeffe69 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,7 +28,8 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed) + const char *name, int offset, int size, int is_signed, + int filter_type) { struct ftrace_event_field *field; @@ -44,7 +45,11 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, if (!field->type) goto err; - field->filter_type = filter_assign_type(type); + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + field->offset = offset; field->size = size; field->is_signed = is_signed; @@ -68,7 +73,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); ret = trace_define_field(call, #type, "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type)); \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 22e6d822bbaa..8a8e576733fc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -475,12 +475,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } -enum { - FILTER_OTHER = 0, - FILTER_STATIC_STRING, - FILTER_DYN_STRING, -}; - int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 70875303ae46..029a91f42287 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -158,7 +158,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -166,7 +167,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0); \ + sizeof(field.item), 0, FILTER_OTHER); \ if (ret) \ return ret; @@ -174,7 +175,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed); \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ if (ret) \ return ret; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46c1b977a2cb..97a2454760b0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -194,7 +194,8 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, - sizeof(unsigned long), 0); + sizeof(unsigned long), 0, + FILTER_OTHER); offset += sizeof(unsigned long); } @@ -210,7 +211,8 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0); + ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + FILTER_OTHER); return ret; } -- cgit From 87a342f5db69d53ea70493bb1ec69c9047677038 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Aug 2009 10:33:43 +0800 Subject: tracing/filters: Support filtering for char * strings Usually, char * entries are dangerous in traces because the string can be released whereas a pointer to it can still wait to be read from the ring buffer. But sometimes we can assume it's safe, like in case of RO data (eg: __file__ or __line__, used in bkl trace event). If these RO data are in a module and so is the call to the trace event, then it's safe, because the ring buffer will be flushed once this module get unloaded. To allow char * to be treated as a string: TRACE_EVENT(..., TP_STRUCT__entry( __field_ext(const char *, name, FILTER_PTR_STRING) ... ) ... ); The filtering will not dereference "char *" unless the developer explicitly sets FILTER_PTR_STR in __field_ext. Signed-off-by: Li Zefan LKML-Reference: <4A7B9287.90205@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a8e576733fc..9f03082c81d8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -163,6 +163,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event, + int val1, int val2) +{ + char **addr = (char **)(event + pred->offset); + int cmp, match; + + cmp = strncmp(*addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + /* * Filter predicate for dynamic sized arrays of characters. * These are implemented through a list of strings at the end @@ -489,7 +503,8 @@ int filter_assign_type(const char *type) static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING; + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; } static int is_legal_op(struct ftrace_event_field *field, int op) @@ -579,11 +594,16 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { + pred->str_len = field->size; + if (field->filter_type == FILTER_STATIC_STRING) fn = filter_pred_string; - else + else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - pred->str_len = field->size; + else { + fn = filter_pred_pchar; + pred->str_len = strlen(pred->str_val); + } } else { if (field->is_signed) ret = strict_strtoll(pred->str_val, 0, &val); -- cgit From 5079f3261ffd7fe4a537679af695f2328943a245 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 25 Aug 2009 16:12:56 +0800 Subject: ftrace: Move setting of clock-source out of options There are many clock sources for the tracing system but we can only enable/disable one at a time with the trace/options file. We can move the setting of clock-source out of options and add a separate file for it: # cat trace_clock [local] global # echo global > trace_clock # cat trace_clock local [global] Signed-off-by: Zhao Lei LKML-Reference: <4A939D08.6050604@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 7 ++-- 2 files changed, 79 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ac204360a39..63dbc7ff213f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -323,12 +323,21 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", - "global-clock", "sleep-time", "graph-time", NULL }; +static struct { + u64 (*func)(void); + const char *name; +} trace_clocks[] = { + { trace_clock_local, "local" }, + { trace_clock_global, "global" }, +}; + +int trace_clock_id; + /* * ftrace_max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are @@ -2159,22 +2168,6 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; - - if (mask == TRACE_ITER_GLOBAL_CLK) { - u64 (*func)(void); - - if (enabled) - func = trace_clock_global; - else - func = trace_clock_local; - - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, func); - - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, func); - mutex_unlock(&trace_types_lock); - } } static ssize_t @@ -3142,6 +3135,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int bufiter = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, + "%s%s%s%s", i ? " " : "", + i == trace_clock_id ? "[" : "", trace_clocks[i].name, + i == trace_clock_id ? "]" : ""); + bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char buf[64]; + const char *clockstr; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + trace_clock_id = i; + + mutex_lock(&trace_types_lock); + + ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + + mutex_unlock(&trace_types_lock); + + *fpos += cnt; + + return cnt; +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3179,6 +3228,12 @@ static const struct file_operations tracing_mark_fops = { .write = tracing_mark_write, }; +static const struct file_operations trace_clock_fops = { + .open = tracing_open_generic, + .read = tracing_clock_read, + .write = tracing_clock_write, +}; + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -3918,6 +3973,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("trace_clock", 0644, d_tracer, NULL, + &trace_clock_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64dda5709cb9..654fd657bd03 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -568,6 +568,8 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; +extern int trace_clock_id; + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); @@ -656,9 +658,8 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK_MSGONLY = 0x10000, TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_GLOBAL_CLK = 0x80000, - TRACE_ITER_SLEEP_TIME = 0x100000, - TRACE_ITER_GRAPH_TIME = 0x200000, + TRACE_ITER_SLEEP_TIME = 0x80000, + TRACE_ITER_GRAPH_TIME = 0x100000, }; /* -- cgit From cd0980fc8add25e8ab12fcf1051c0f20cbc7c0c0 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 25 Aug 2009 14:50:27 +0200 Subject: tracing: Check invalid syscall nr while tracing syscalls Most arch syscall_get_nr() implementations returns -1 if the syscall number is not valid. Accessing the bit field without a check might result in a kernel oops (at least I saw it on s390 for ftrace selftest). Before this change, this problem did not occur, because the invalid syscall number (-1) caused syscall_nr_to_meta() to return NULL. There are at least two scenarios where syscall_get_nr() can return -1: 1. For example, ptrace stores an invalid syscall number, and thus, tracing code resets it. (see do_syscall_trace_enter in arch/s390/kernel/ptrace.c) 2. The syscall_regfunc() (kernel/tracepoint.c) sets the TIF_SYSCALL_FTRACE (now: TIF_SYSCALL_TRACEPOINT) flag for all threads which include kernel threads. However, the ftrace selftest triggers a kernel oops when testing syscall trace points: - The kernel thread is started as ususal (do_fork()), - tracing code sets TIF_SYSCALL_FTRACE, - the ret_from_fork() function is triggered and starts ftrace_syscall_exit() with an invalid syscall number. To avoid these scenarios, I suggest to check the syscall_nr. For instance, the ftrace selftest fails for s390 (with config option CONFIG_FTRACE_SYSCALLS set) and produces the following kernel oops. Unable to handle kernel pointer dereference at virtual kernel address 2000000000 Oops: 0038 [#1] PREEMPT SMP Modules linked in: CPU: 0 Not tainted 2.6.31-rc6-next-20090819-dirty #18 Process kthreadd (pid: 818, task: 000000003ea207e8, ksp: 000000003e813eb8) Krnl PSW : 0704100180000000 00000000000ea54c (ftrace_syscall_exit+0x58/0xdc) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 EA:3 Krnl GPRS: 0000000000000000 00000000000e0000 ffffffffffffffff 20000000008c2650 0000000000000007 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 ffffffffffffffff 000000003e813d78 000000003e813f58 0000000000505ba8 000000003e813e18 000000003e813d78 Krnl Code: 00000000000ea540: e330d0000008 ag %r3,0(%r13) 00000000000ea546: a7480007 lhi %r4,7 00000000000ea54a: 1442 nr %r4,%r2 >00000000000ea54c: e31030000090 llgc %r1,0(%r3) 00000000000ea552: 5410d008 n %r1,8(%r13) 00000000000ea556: 8a104000 sra %r1,0(%r4) 00000000000ea55a: 5410d00c n %r1,12(%r13) 00000000000ea55e: 1211 ltr %r1,%r1 Call Trace: ([<0000000000000000>] 0x0) [<000000000001fa22>] do_syscall_trace_exit+0x132/0x18c [<000000000002d0c4>] sysc_return+0x0/0x8 [<000000000001c738>] kernel_thread_starter+0x0/0xc Last Breaking-Event-Address: [<00000000000ea51e>] ftrace_syscall_exit+0x2a/0xdc Signed-off-by: Hendrik Brueckner Acked-by: Heiko Carstens Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Martin Schwidefsky Cc: Paul Mundt LKML-Reference: <20090825125027.GE4639@cetus.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 85291c4de406..cb7f600cb02a 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -227,6 +227,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_enter_syscalls)) return; @@ -257,6 +259,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_exit_syscalls)) return; -- cgit From cc3b13c11c567c69a6356be98d0c03ff11541d5c Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 25 Aug 2009 18:02:37 +0200 Subject: tracing: Don't trace kernel thread syscalls Kernel threads don't call syscalls using the sysenter/sysexit path. Instead they directly call the sys_* or do_* functions that implement the syscalls inside the kernel. The current syscall tracepoints only bind the sysenter/sysexit path, then it has no effect to trace the kernel thread calls to syscalls in that path. Setting the TIF_SYSCALL_TRACEPOINT flag is then useless for these. Actually there is only one case when a kernel thread can reach the usual syscall exit tracing path: when we create a kernel thread, the child comes to ret_from_fork and is the fork() return is then traced. But this information alone is useless, then we don't want to set the TIF flags for these threads. Kernel threads have task_struct->mm set to NULL. (Thanks to Heiko for that hint ;-) The idea is then to check the mm field in syscall_regfunc() and set the flag accordingly. Signed-off-by: Hendrik Brueckner Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Heiko Carstens Cc: Hendrik Brueckner LKML-Reference: <20090825160237.GG4639@cetus.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- kernel/tracepoint.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1a6a453b7efb..9489a0a9b1be 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -597,7 +597,9 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + /* Skip kernel threads. */ + if (t->mm) + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit From 57421dbbdc932d65f0e6a41ebb027a2bfe3d0669 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 24 Aug 2009 17:40:22 -0400 Subject: tracing: Convert event tracing code to use NR_syscalls Convert the syscalls event tracing code to use NR_syscalls, instead of FTRACE_SYSCALL_MAX. NR_syscalls is standard accross most arches, and reduces code confusion/complexity. Signed-off-by: Jason Baron Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Josh Stone Cc: Thomas Gleixner Cc: H. Peter Anwin Cc: Hendrik Brueckner Cc: Heiko Carstens LKML-Reference: <9b4f1a84ecae57cc6599412772efa36f0d2b815b.1251146513.git.jbaron@redhat.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_syscalls.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb7f600cb02a..4f5fae6fad90 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -11,8 +11,8 @@ static DEFINE_MUTEX(syscall_trace_lock); static int sys_refcount_enter; static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); -static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) @@ -289,7 +289,7 @@ int reg_event_syscall_enter(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -312,7 +312,7 @@ void unreg_event_syscall_enter(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -330,7 +330,7 @@ int reg_event_syscall_exit(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -353,7 +353,7 @@ void unreg_event_syscall_exit(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; @@ -373,8 +373,8 @@ struct trace_event event_syscall_exit = { #ifdef CONFIG_EVENT_PROFILE -static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); static int sys_prof_refcount_enter; static int sys_prof_refcount_exit; @@ -420,7 +420,7 @@ int reg_prof_syscall_enter(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -442,7 +442,7 @@ void unreg_prof_syscall_enter(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -481,7 +481,7 @@ int reg_prof_syscall_exit(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -503,7 +503,7 @@ void unreg_prof_syscall_exit(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); -- cgit From c0729be99cb2b9d9749256254f1c40a801835896 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Aug 2009 22:23:52 -0400 Subject: tracing: remove legacy select of MARKERS by context switch tracing The context switch tracer was made before tracepoints were mature, and the original version used markers. This is no longer true and this patch removes the select. Reported-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 06be85a7ef8c..163fbfc2f39f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -60,7 +60,6 @@ config EVENT_TRACING bool config CONTEXT_SWITCH_TRACER - select MARKERS bool # All tracer options should select GENERIC_TRACER. For those options that are -- cgit From 5d4a9dba2d7fbab69f00dedd430d1788834a055a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Aug 2009 16:52:21 -0400 Subject: tracing: only show tracing_max_latency when latency tracer configured The tracing_max_latency file should only be present when one of the latency tracers ({preempt|irqs}off, wakeup*) are enabled. This patch also removes tracing_thresh when latency tracers are not enabled, as well as compiles out code that is only used for latency tracers. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++------------------------ kernel/trace/trace.h | 2 ++ 2 files changed, 52 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 63dbc7ff213f..0f0881676dc9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,9 +43,6 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -338,45 +335,6 @@ static struct { int trace_clock_id; -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a raw_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct trace_array_cpu *data = tr->data[cpu]; - - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; - - data = max_tr.data[cpu]; - data->saved_latency = tracing_max_latency; - - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); - data->pid = tsk->pid; - data->uid = task_uid(tsk); - data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - data->policy = tsk->policy; - data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(tsk); -} - ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -420,6 +378,53 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ +static raw_spinlock_t ftrace_max_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_TRACER_MAX_TRACE +unsigned long __read_mostly tracing_max_latency; +unsigned long __read_mostly tracing_thresh; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = task_uid(tsk); + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); +} + /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -476,6 +481,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); } +#endif /* CONFIG_TRACER_MAX_TRACE */ /** * register_tracer - register a tracer with the ftrace system. @@ -3952,11 +3958,13 @@ static __init int tracer_init_debugfs(void) trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); +#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 654fd657bd03..e2c06b21dd82 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -473,12 +473,14 @@ void unregister_tracer(struct tracer *type); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +#ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); +#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, -- cgit From 8e254c1d183f0225ad21f9049641529e56cce4da Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 31 Aug 2009 16:49:41 +0800 Subject: tracing/filters: Defer pred allocation init_preds() allocates about 5392 bytes of memory (on x86_32) for a TRACE_EVENT. With my config, at system boot total memory occupied is: 5392 * (642 + 15) == 3459KB 642 == cat available_events | wc -l 15 == number of dirs in events/ftrace That's quite a lot, so we'd better defer memory allocation util it's needed, that's when filter is used. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi Cc: Masami Hiramatsu LKML-Reference: <4A9B8EA5.6020700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 50 +++++++++++++++++++++++++++++++------- kernel/trace/trace_export.c | 1 - 2 files changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9f03082c81d8..c6b2edfb7fe9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -309,7 +309,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) struct event_filter *filter = call->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -322,7 +322,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, struct event_filter *filter = system->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -390,6 +390,9 @@ void destroy_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; + if (!filter) + return; + for (i = 0; i < MAX_FILTER_PRED; i++) { if (filter->preds[i]) filter_free_pred(filter->preds[i]); @@ -400,7 +403,7 @@ void destroy_preds(struct ftrace_event_call *call) call->filter = NULL; } -int init_preds(struct ftrace_event_call *call) +static int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; struct filter_pred *pred; @@ -410,7 +413,6 @@ int init_preds(struct ftrace_event_call *call) if (!call->filter) return -ENOMEM; - call->filter_active = 0; filter->n_preds = 0; filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); @@ -432,7 +434,28 @@ oom: return -ENOMEM; } -EXPORT_SYMBOL_GPL(init_preds); + +static int init_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call; + int err; + + list_for_each_entry(call, &ftrace_events, list) { + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + if (!call->filter) { + err = init_preds(call); + if (err) + return err; + } + } + + return 0; +} enum { FILTER_DISABLE_ALL, @@ -449,6 +472,9 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (!call->define_fields) continue; + if (strcmp(call->system, system->name) != 0) + continue; + if (flag == FILTER_INIT_NO_RESET) { call->filter->no_reset = false; continue; @@ -457,10 +483,8 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) continue; - if (!strcmp(call->system, system->name)) { - filter_disable_preds(call); - remove_filter_string(call->filter); - } + filter_disable_preds(call); + remove_filter_string(call->filter); } } @@ -1094,6 +1118,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); + err = init_preds(call); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); @@ -1139,6 +1167,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + err = init_subsystem_preds(system); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); remove_filter_string(system->filter); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 029a91f42287..df1bf6e48bb9 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -135,7 +135,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ return 0; \ } \ -- cgit From 41b6a95d693319f804607b559893fbbd27498548 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Sep 2009 09:59:48 -0400 Subject: ring-buffer: do not reset while in a commit The callers of reset must ensure that no commit can be taking place at the time of the reset. If it does then we may corrupt the ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index da2c59d8f486..79d6012bb1f1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3373,12 +3373,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); __raw_spin_unlock(&cpu_buffer->lock); + out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); -- cgit From 98277991a99734f3a31d638afb47d4484ac73e43 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Sep 2009 10:56:15 -0400 Subject: ring-buffer: do not swap buffers during a commit If a commit is taking place on a CPU ring buffer, do not allow it to be swapped. Return -EBUSY when this is detected instead. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 79d6012bb1f1..2878bd43a59c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3519,16 +3519,23 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); + ret = -EBUSY; + if (local_read(&cpu_buffer_a->committing)) + goto out_dec; + if (local_read(&cpu_buffer_b->committing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; + ret = 0; + +out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - - ret = 0; out: return ret; } -- cgit From 1b959e18c4d6b4b981f887260b0f8e7939efa411 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 10:12:13 -0400 Subject: ring-buffer: remove unnecessary cpu_relax The loops in the ring buffer that use cpu_relax are not dependent on other CPUs. They simply came across some padding in the ring buffer and are skipping over them. It is a normal loop and does not require a cpu_relax. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2878bd43a59c..a05541a8fbae 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3132,10 +3132,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -3160,10 +3158,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -3209,10 +3205,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -3302,10 +3296,8 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } -- cgit From 7e9391cfedce34eb9786bfa69d7d545dc93ef930 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 10:02:09 -0400 Subject: ring-buffer: fix ring_buffer_read crossing pages When the ring buffer uses an iterator (static read mode, not on the fly reading), when it crosses a page boundery, it will skip the first entry on the next page. The reason is that the last entry of a page is usually padding if the page is not full. The padding will not be returned to the user. The problem arises on ring_buffer_read because it also increments the iterator. Because both the read and peek use the same rb_iter_peek, the rb_iter_peak will return the padding but also increment to the next item. This is because the ring_buffer_peek will not incerment it itself. The ring_buffer_read will increment it again and then call rb_iter_peek again to get the next item. But that will be the second item, not the first one on the page. The reason this never showed up before, is because the ftrace utility always calls ring_buffer_peek first and only uses ring_buffer_read to increment to the next item. The ring_buffer_peek will always keep the pointer to a valid item and not padding. This just hid the bug. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a05541a8fbae..9d939e7ca924 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3286,19 +3286,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + again: event = rb_iter_peek(iter, ts); if (!event) goto out; + if (event->type_len == RINGBUF_TYPE_PADDING) + goto again; + rb_advance_iter(iter); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) - goto again; - return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); -- cgit From dc892f7339af2d125478b800edb9081d6149665b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 15:33:41 -0400 Subject: ring-buffer: remove ring_buffer_event_discard The function ring_buffer_event_discard can be used on any item in the ring buffer, even after the item was committed. This function provides no safety nets and is very race prone. An item may be safely removed from the ring buffer before it is committed with the ring_buffer_discard_commit. Since there are currently no users of this function, and because this function is racey and error prone, this patch removes it altogether. Note, removing this function also allows the counters to ignore all discarded events (patches will follow). Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9d939e7ca924..092fe0c8fdae 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2327,32 +2327,17 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -/** - * ring_buffer_event_discard - discard any event in the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - rb_event_discard(event); -} -EXPORT_SYMBOL_GPL(ring_buffer_event_discard); - /** * ring_buffer_commit_discard - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * - * This is similar to ring_buffer_event_discard but must only be - * performed on an event that has not been committed yet. The difference - * is that this will also try to free the event from the ring buffer + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * This function only works if it is called before the the item has been + * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event -- cgit From a1863c212b7517afc2b13e549552ac322fb44cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 10:23:58 -0400 Subject: ring-buffer: do not count discarded events The latency tracers report the number of items in the trace buffer. This uses the ring buffer data to calculate this. Because discarded events are also counted, the numbers do not match the number of items that are printed. The ring buffer also adds a "padding" item to the end of each buffer page which also gets counted as a discarded item. This patch decrements the counter to the page entries on a discard. This allows us to ignore discarded entries while reading the buffer. Decrementing the counter is still safe since it can only happen while the committing flag is still set. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 092fe0c8fdae..c8d2a66e1d1f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -218,17 +218,12 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type_len == RINGBUF_TYPE_PADDING - && event->time_delta == 0; -} - -static inline int rb_discarded_event(struct ring_buffer_event *event) -{ - return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { + /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -1778,9 +1773,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; - /* Account for this as an entry */ - local_inc(&tail_page->entries); - local_inc(&cpu_buffer->entries); /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; @@ -2269,18 +2261,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - local_inc(&cpu_buffer->entries); - /* * The event first in the commit queue updates the * time stamp. */ if (rb_event_is_commit(cpu_buffer, event)) cpu_buffer->write_stamp += event->time_delta; +} +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -2327,6 +2324,46 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } +/* + * Decrement the entries to the page that an event is on. + * The event does not even need to exist, only the pointer + * to the page it is on. This may only be called before the commit + * takes place. + */ +static inline void +rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + struct buffer_page *bpage = cpu_buffer->commit_page; + struct buffer_page *start; + + addr &= PAGE_MASK; + + /* Do the likely case first */ + if (likely(bpage->page == (void *)addr)) { + local_dec(&bpage->entries); + return; + } + + /* + * Because the commit page may be on the reader page we + * start with the next page and check the end loop there. + */ + rb_inc_page(cpu_buffer, &bpage); + start = bpage; + do { + if (bpage->page == (void *)addr) { + local_dec(&bpage->entries); + return; + } + rb_inc_page(cpu_buffer, &bpage); + } while (bpage != start); + + /* commit not part of this buffer?? */ + RB_WARN_ON(cpu_buffer, 1); +} + /** * ring_buffer_commit_discard - discard an event that has not been committed * @buffer: the ring buffer @@ -2365,14 +2402,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); + rb_decrement_entry(cpu_buffer, event); if (rb_try_to_discard(cpu_buffer, event)) goto out; /* * The commit is still visible by the reader, so we - * must increment entries. + * must still update the timestamp. */ - local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); @@ -2884,8 +2922,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX - || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); -- cgit From 077c5407cd3231cf13472623995f0dfdda510d62 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 19:53:46 -0400 Subject: ring-buffer: disable all cpu buffers when one finds a problem Currently the way RB_WARN_ON works, is to disable either the current CPU buffer or all CPU buffers, depending on whether a ring_buffer or ring_buffer_per_cpu struct was passed into the macro. Most users of the RB_WARN_ON pass in the CPU buffer, so only the one CPU buffer gets disabled but the rest are still active. This may confuse users even though a warning is sent to the console. This patch changes the macro to disable the entire buffer even if the CPU buffer is passed in. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c8d2a66e1d1f..f83a42a79ee8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -467,14 +467,19 @@ struct ring_buffer_iter { }; /* buffer may be either ring_buffer or ring_buffer_per_cpu */ -#define RB_WARN_ON(buffer, cond) \ - ({ \ - int _____ret = unlikely(cond); \ - if (_____ret) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - _____ret; \ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ -- cgit From 8248ac052dfd1eb41819fbc0ca5c7a1667e7e70c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Sep 2009 12:27:41 -0400 Subject: tracing: print out start and stop in latency traces During development of the tracer, we would copy information from the live tracer to the max tracer with one memcpy. Since then we added a generic ring buffer and we handle the copies differently now. Unfortunately, we never copied the critical section information, and we lost the output: # => started at: kmem_cache_alloc # => ended at: kmem_cache_alloc This patch adds back the critical start and end copying as well as removes the unused "trace_idx" and "overrun" fields of the trace_array_cpu structure. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 +++++++++++-------- kernel/trace/trace.h | 3 --- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0f0881676dc9..df2c9f730ac6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -407,19 +407,22 @@ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; + struct trace_array_cpu *max_data = tr->data[cpu]; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; - data = max_tr.data[cpu]; - data->saved_latency = tracing_max_latency; + max_data = max_tr.data[cpu]; + max_data->saved_latency = tracing_max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; memcpy(data->comm, tsk->comm, TASK_COMM_LEN); - data->pid = tsk->pid; - data->uid = task_uid(tsk); - data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - data->policy = tsk->policy; - data->rt_priority = tsk->rt_priority; + max_data->pid = tsk->pid; + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; /* record this tasks comm */ tracing_record_cmdline(tsk); @@ -1501,7 +1504,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "#\n"); + seq_puts(m, "\n#\n"); } seq_puts(m, "#\n"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e2c06b21dd82..f2af713a8bcc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -234,9 +234,6 @@ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ - /* these fields get copied into max-trace: */ - unsigned long trace_idx; - unsigned long overrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; -- cgit From b8de7bd168fa54d059b16d3057b2f8a32cc5bdc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 31 Aug 2009 22:32:27 -0400 Subject: tracing: disable update max tracer while reading trace When reading the tracer from the trace file, updating the max latency may corrupt the output. This patch disables the tracing of the max latency while reading the trace file. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index df2c9f730ac6..e521f1e8f2bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -263,6 +263,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME; +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); + /** * trace_wake_up - wake up tasks waiting for trace input * @@ -442,6 +445,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct ring_buffer *buf = tr->buffer; + if (trace_stop_count) + return; + WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); @@ -469,6 +475,9 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; + if (trace_stop_count) + return; + WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); @@ -685,9 +694,6 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } -static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); - /** * ftrace_off_permanent - disable all ftrace code permanently * -- cgit From 621968cdb2563b667d6ecb484ba91ef4c3a797b3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 12:02:35 -0400 Subject: tracing: disable buffers and synchronize_sched before resetting Resetting the ring buffers while traces are happening can corrupt the ring buffer and disable it (no kernel crash to worry about). The safest thing to do is disable the ring buffers, call synchronize_sched() to wait for all current writers to finish and then reset the buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e521f1e8f2bb..9110329ecf77 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -658,12 +658,20 @@ void tracing_reset(struct trace_array *tr, int cpu) void tracing_reset_online_cpus(struct trace_array *tr) { + struct ring_buffer *buffer = tr->buffer; int cpu; + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) tracing_reset(tr, cpu); + + ring_buffer_record_enable(buffer); } void tracing_reset_current(int cpu) -- cgit From 76f0d07376388f32698ba51b6090a26b90c1342f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 12:12:39 -0400 Subject: tracing: remove users of tracing_reset The function tracing_reset is deprecated for outside use of trace.c. The new function to reset the the buffers is tracing_reset_online_cpus. The reason for this is that resetting the buffers while the event trace points are active can corrupt the buffers, because they may be writing at the time of reset. The tracing_reset_online_cpus disables writes and waits for current writers to finish. This patch replaces all users of tracing_reset except for the latency tracers. Those changes require more work and will be removed in the following patches. Signed-off-by: Steven Rostedt --- kernel/trace/kmemtrace.c | 4 +--- kernel/trace/trace.c | 7 ++----- kernel/trace/trace_boot.c | 4 +--- kernel/trace/trace_power.c | 4 +--- 4 files changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index dda53ccf749b..81b1645c8549 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void) static int kmem_trace_init(struct trace_array *tr) { - int cpu; kmemtrace_array = tr; - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); kmemtrace_start_probes(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9110329ecf77..54517a889791 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -550,7 +550,6 @@ __acquires(kernel_lock) if (type->selftest && !tracing_selftest_disabled) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int i; /* * Run a selftest on this tracer. @@ -559,8 +558,7 @@ __acquires(kernel_lock) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_tracing_cpu(i) - tracing_reset(tr, i); + tracing_reset_online_cpus(tr); current_trace = type; /* the test is responsible for initializing and enabling */ @@ -573,8 +571,7 @@ __acquires(kernel_lock) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) - tracing_reset(tr, i); + tracing_reset_online_cpus(tr); printk(KERN_CONT "PASSED\n"); } diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a29ef23ffb47..863139327816 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -41,14 +41,12 @@ void disable_boot_trace(void) static int boot_trace_init(struct trace_array *tr) { - int cpu; boot_trace = tr; if (!tr) return 0; - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); tracing_sched_switch_assign_trace(tr); return 0; diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 8a30d9874cd4..a5d5a4f7745b 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -144,14 +144,12 @@ static void power_trace_reset(struct trace_array *tr) static int power_trace_init(struct trace_array *tr) { - int cpu; power_trace = tr; trace_power_enabled = 1; tracing_power_register(); - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); return 0; } -- cgit From c58b43218c1a04a0bcf338ea47406c759ac28e11 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Sep 2009 13:31:38 +0800 Subject: tracing/filters: Defer pred allocation, fix memory leak The predicates of an event and their filter structure are allocated when we create an event filter for the first time. These objects must be created once but each time we come with a new filter, we overwrite such pre-existing allocation, if any. Thus, this patch checks if the filter has already been allocated before going ahead. Spotted-by: Frederic Weisbecker Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Tom Zanussi Cc: Masami Hiramatsu LKML-Reference: <4A9CB1BA.3060402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c6b2edfb7fe9..93660fbbf629 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -409,6 +409,9 @@ static int init_preds(struct ftrace_event_call *call) struct filter_pred *pred; int i; + if (call->filter) + return 0; + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!call->filter) return -ENOMEM; @@ -447,11 +450,9 @@ static int init_subsystem_preds(struct event_subsystem *system) if (strcmp(call->system, system->name) != 0) continue; - if (!call->filter) { - err = init_preds(call); - if (err) - return err; - } + err = init_preds(call); + if (err) + return err; } return 0; -- cgit From 2f26ebd549b9ab55ac756b836ec759c11fe93f81 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 1 Sep 2009 11:06:29 -0400 Subject: tracing: use timestamp to determine start of latency traces Currently the latency tracers reset the ring buffer. Unfortunately if a commit is in process (due to a trace event), this can corrupt the ring buffer. When this happens, the ring buffer will detect the corruption and then permanently disable the ring buffer. The bug does not crash the system, but it does prevent further tracing after the bug is hit. Instead of reseting the trace buffers, the timestamp of the start of the trace is used instead. The buffers will still contain the previous data, but the output will not count any data that is before the timestamp of the trace. Note, this only affects the static trace output (trace) and not the runtime trace output (trace_pipe). The runtime trace output does not make sense for the latency tracers anyway. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 80 ++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 3 +- kernel/trace/trace_sched_wakeup.c | 7 +--- 4 files changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 54517a889791..7daf372e319a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -454,10 +454,6 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tr->buffer = max_tr.buffer; max_tr.buffer = buf; - ftrace_disable_cpu(); - ring_buffer_reset(tr->buffer); - ftrace_enable_cpu(); - __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); } @@ -483,7 +479,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) ftrace_disable_cpu(); - ring_buffer_reset(max_tr.buffer); ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); ftrace_enable_cpu(); @@ -1374,6 +1369,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return ent; } +static void tracing_iter_reset(struct trace_iterator *iter, int cpu) +{ + struct trace_array *tr = iter->tr; + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter; + unsigned long entries = 0; + u64 ts; + + tr->data[cpu]->skipped_entries = 0; + + if (!iter->buffer_iter[cpu]) + return; + + buf_iter = iter->buffer_iter[cpu]; + ring_buffer_iter_reset(buf_iter); + + /* + * We could have the case with the max latency tracers + * that a reset never took place on a cpu. This is evident + * by the timestamp being before the start of the buffer. + */ + while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + if (ts >= iter->tr->time_start) + break; + entries++; + ring_buffer_read(buf_iter, NULL); + } + + tr->data[cpu]->skipped_entries = entries; +} + /* * No necessary locking here. The worst thing which can * happen is loosing events consumed at the same time @@ -1412,10 +1438,9 @@ static void *s_start(struct seq_file *m, loff_t *pos) if (cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) - ring_buffer_iter_reset(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); } else - ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); - + tracing_iter_reset(iter, cpu_file); ftrace_enable_cpu(); @@ -1464,16 +1489,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long total; - unsigned long entries; + unsigned long entries = 0; + unsigned long total = 0; + unsigned long count; const char *name = "preemption"; + int cpu; if (type) name = type->name; - entries = ring_buffer_entries(iter->tr->buffer); - total = entries + - ring_buffer_overruns(iter->tr->buffer); + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + total += count; + } else + total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + entries += count; + } seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1534,6 +1575,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; + if (iter->tr->data[iter->cpu]->skipped_entries) + return; + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ @@ -1796,19 +1840,23 @@ __tracing_open(struct inode *inode, struct file *file) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + /* stop the trace while dumping */ + tracing_stop(); + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); + tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); + tracing_iter_reset(iter, cpu); } - /* TODO stop tracer */ ret = seq_open(file, &tracer_seq_ops); if (ret < 0) { fail_ret = ERR_PTR(ret); @@ -1818,9 +1866,6 @@ __tracing_open(struct inode *inode, struct file *file) m = file->private_data; m->private = iter; - /* stop the trace while dumping */ - tracing_stop(); - mutex_unlock(&trace_types_lock); return iter; @@ -1831,6 +1876,7 @@ __tracing_open(struct inode *inode, struct file *file) ring_buffer_read_finish(iter->buffer_iter[cpu]); } free_cpumask_var(iter->started); + tracing_start(); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f2af713a8bcc..ca070de36227 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -241,6 +241,7 @@ struct trace_array_cpu { unsigned long nice; unsigned long policy; unsigned long rt_priority; + unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; uid_t uid; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b923d13e2fad..5555b75a0d12 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -178,7 +178,6 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(tr, cpu); trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } @@ -208,7 +207,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip; - tracing_reset(tr, cpu); local_save_flags(flags); @@ -379,6 +377,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); + tracing_reset_online_cpus(tr); start_irqsoff_tracer(tr); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index eacb27225173..ad69f105a7c6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -186,11 +186,6 @@ out: static void __wakeup_reset(struct trace_array *tr) { - int cpu; - - for_each_possible_cpu(cpu) - tracing_reset(tr, cpu); - wakeup_cpu = -1; wakeup_prio = -1; @@ -204,6 +199,8 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; + tracing_reset_online_cpus(tr); + local_irq_save(flags); __raw_spin_lock(&wakeup_lock); __wakeup_reset(tr); -- cgit From f633903af2ceb0cec07d45e499a072b6593d0ed1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 12:35:16 -0400 Subject: tracing: make tracing_reset safe for external use Reseting the trace buffer without first disabling the buffer and waiting for any writers to complete, can corrupt the ring buffer. This patch makes the external version of tracing_reset safe from corruption by disabling the ring buffer and calling synchronize_sched. This version can no longer be called from interrupt context. But all those callers have been removed. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7daf372e319a..0418e2650d41 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -641,13 +641,26 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct trace_array *tr, int cpu) { ftrace_disable_cpu(); ring_buffer_reset_cpu(tr->buffer, cpu); ftrace_enable_cpu(); } +void tracing_reset(struct trace_array *tr, int cpu) +{ + struct ring_buffer *buffer = tr->buffer; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + __tracing_reset(tr, cpu); + + ring_buffer_record_enable(buffer); +} + void tracing_reset_online_cpus(struct trace_array *tr) { struct ring_buffer *buffer = tr->buffer; @@ -661,7 +674,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + __tracing_reset(tr, cpu); ring_buffer_record_enable(buffer); } -- cgit From e77405ad80f53966524b5c31244e13fbbbecbd84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Sep 2009 14:17:06 -0400 Subject: tracing: pass around ring buffer instead of tracer The latency tracers (irqsoff and wakeup) can swap trace buffers on the fly. If an event is happening and has reserved data on one of the buffers, and the latency tracer swaps the global buffer with the max buffer, the result is that the event may commit the data to the wrong buffer. This patch changes the API to the trace recording to be recieve the buffer that was used to reserve a commit. Then this buffer can be passed in to the commit. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 12 ++-- kernel/trace/trace.c | 117 ++++++++++++++++++++--------------- kernel/trace/trace.h | 17 ++--- kernel/trace/trace_boot.c | 12 ++-- kernel/trace/trace_events.c | 6 +- kernel/trace/trace_functions_graph.c | 14 +++-- kernel/trace/trace_mmiotrace.c | 10 +-- kernel/trace/trace_power.c | 18 ++++-- kernel/trace/trace_sched_switch.c | 18 +++--- kernel/trace/trace_syscalls.c | 18 +++--- 10 files changed, 143 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1090b0aed9ba..243bafc2ec90 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -65,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { + buffer = blk_tr->buffer; pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, 0, pc); if (!event) @@ -96,7 +98,7 @@ record_it: memcpy((void *) t + sizeof(*t), data, len); if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); } } @@ -179,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; @@ -204,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); + buffer = blk_tr->buffer; pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, 0, pc); if (!event) @@ -252,7 +256,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); return; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0418e2650d41..0c61836e30e7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -169,10 +169,11 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); -int filter_current_check_discard(struct ftrace_event_call *call, void *rec, +int filter_current_check_discard(struct ring_buffer *buffer, + struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) { - return filter_check_discard(call, rec, global_trace.buffer, event); + return filter_check_discard(call, rec, buffer, event); } EXPORT_SYMBOL_GPL(filter_current_check_discard); @@ -887,14 +888,15 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - int type, - unsigned long len, - unsigned long flags, int pc) +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) { struct ring_buffer_event *event; - event = ring_buffer_lock_reserve(tr->buffer, len); + event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) { struct trace_entry *ent = ring_buffer_event_data(event); @@ -905,53 +907,59 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, return event; } -static inline void __trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) +static inline void +__trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { - ring_buffer_unlock_commit(tr->buffer, event); + ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); + ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_userstack(buffer, flags, pc); if (wake) trace_wake_up(); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { - __trace_buffer_unlock_commit(tr, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); } struct ring_buffer_event * -trace_current_buffer_lock_reserve(int type, unsigned long len, +trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, + int type, unsigned long len, unsigned long flags, int pc) { - return trace_buffer_lock_reserve(&global_trace, + *current_rb = global_trace.buffer; + return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); } EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { - __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) { - ring_buffer_discard_commit(global_trace.buffer, event); + ring_buffer_discard_commit(buffer, event); } EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); @@ -961,6 +969,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -968,7 +977,7 @@ trace_function(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), + event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) return; @@ -976,8 +985,8 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } void @@ -990,7 +999,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_array *tr, +static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { @@ -999,7 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct stack_entry *entry; struct stack_trace trace; - event = trace_buffer_lock_reserve(tr, TRACE_STACK, + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry), flags, pc); if (!event) return; @@ -1012,26 +1021,27 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } -void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc) +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(tr, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc); } void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc); } -void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) +void +ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; @@ -1041,7 +1051,7 @@ void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) return; @@ -1055,8 +1065,8 @@ void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } #ifdef UNUSED @@ -1075,9 +1085,10 @@ ftrace_trace_special(void *__tr, { struct ring_buffer_event *event; struct trace_array *tr = __tr; + struct ring_buffer *buffer = tr->buffer; struct special_entry *entry; - event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, + event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, sizeof(*entry), 0, pc); if (!event) return; @@ -1085,7 +1096,7 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - trace_buffer_unlock_commit(tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); } void @@ -1131,6 +1142,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct bprint_entry *entry; @@ -1163,7 +1175,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1171,8 +1185,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1194,6 +1208,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; int cpu, len = 0, size, pc; @@ -1222,7 +1237,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1230,8 +1247,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ca070de36227..4d30414fe19a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -415,12 +415,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; -struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - int type, - unsigned long len, - unsigned long flags, - int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, + int pc); +void trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc); @@ -481,10 +482,10 @@ void update_max_tr_single(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc); -void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, +void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 863139327816..19bfc75d467e 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -130,6 +130,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_boot_call *entry; struct trace_array *tr = boot_trace; @@ -142,13 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->boot_call = *bt; - trace_buffer_unlock_commit(tr, event, 0, 0); + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } @@ -156,6 +158,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_boot_ret *entry; struct trace_array *tr = boot_trace; @@ -165,13 +168,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->boot_ret = *bt; - trace_buffer_unlock_commit(tr, event, 0, 0); + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d33bcdeffe69..78b1ed230177 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1438,6 +1438,7 @@ static void function_test_events_call(unsigned long ip, unsigned long parent_ip) { struct ring_buffer_event *event; + struct ring_buffer *buffer; struct ftrace_entry *entry; unsigned long flags; long disabled; @@ -1455,7 +1456,8 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + event = trace_current_buffer_lock_reserve(&buffer, + TRACE_FN, sizeof(*entry), flags, pc); if (!event) goto out; @@ -1463,7 +1465,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_nowake_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: atomic_dec(&per_cpu(test_event_disable, cpu)); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3f4a251b7d16..b3749a2c3132 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -173,19 +173,20 @@ static int __trace_graph_entry(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return 0; - event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT, + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); return 1; } @@ -236,19 +237,20 @@ static void __trace_graph_return(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET, + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index d53b45ed0806..c4c9bbda53d3 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,11 +307,12 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); @@ -319,7 +320,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -333,11 +334,12 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); @@ -345,7 +347,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a5d5a4f7745b..fe1a00f1445a 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it) { struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_power *entry; struct trace_array_cpu *data; struct trace_array *tr = power_trace; @@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it) if (!trace_power_enabled) return; + buffer = tr->buffer; + preempt_disable(); it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = trace_buffer_lock_reserve(tr, TRACE_POWER, + event = trace_buffer_lock_reserve(buffer, TRACE_POWER, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } @@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, { struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_power *entry; struct trace_array_cpu *data; struct trace_array *tr = power_trace; @@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, if (!trace_power_enabled) return; + buffer = tr->buffer; + memset(it, 0, sizeof(struct power_trace)); it->state = level; it->type = type; @@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = trace_buffer_lock_reserve(tr, TRACE_POWER, + event = trace_buffer_lock_reserve(buffer, TRACE_POWER, sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e1285d7b5488..5fca0f51fde4 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,10 +28,11 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - event = trace_buffer_lock_reserve(tr, TRACE_CTX, + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, sizeof(*entry), flags, pc); if (!event) return; @@ -44,8 +45,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, flags, pc); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); } static void @@ -86,8 +87,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->buffer; - event = trace_buffer_lock_reserve(tr, TRACE_WAKE, + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); if (!event) return; @@ -100,10 +102,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr->buffer, flags, 6, pc); + ftrace_trace_userstack(tr->buffer, flags, pc); } static void diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4f5fae6fad90..8712ce3c6a0e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -223,6 +223,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; + struct ring_buffer *buffer; int size; int syscall_nr; @@ -238,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(sys_data->enter_id, size, - 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, + size, 0, 0); if (!event) return; @@ -247,8 +248,9 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_current_check_discard(sys_data->enter_event, entry, event)) - trace_current_buffer_unlock_commit(event, 0, 0); + if (!filter_current_check_discard(buffer, sys_data->enter_event, + entry, event)) + trace_current_buffer_unlock_commit(buffer, event, 0, 0); } void ftrace_syscall_exit(struct pt_regs *regs, long ret) @@ -256,6 +258,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; + struct ring_buffer *buffer; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); @@ -268,7 +271,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(sys_data->exit_id, + event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, sizeof(*entry), 0, 0); if (!event) return; @@ -277,8 +280,9 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_current_check_discard(sys_data->exit_event, entry, event)) - trace_current_buffer_unlock_commit(event, 0, 0); + if (!filter_current_check_discard(buffer, sys_data->exit_event, + entry, event)) + trace_current_buffer_unlock_commit(buffer, event, 0, 0); } int reg_event_syscall_enter(void *ptr) -- cgit From 659372d3e42a3e17a2e042d38a8bcdb94bfbe797 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 19:11:07 -0400 Subject: tracing: add trace_array_printk for internal tracers to use This patch adds a trace_array_printk to allow a tracer to use the trace_printk on its own trace array. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++++++++++++-- kernel/trace/trace.h | 5 +++++ 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c61836e30e7..ef08328eb28d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1201,7 +1201,23 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; @@ -1209,7 +1225,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct ring_buffer *buffer; - struct trace_array *tr = &global_trace; struct trace_array_cpu *data; int cpu, len = 0, size, pc; struct print_entry *entry; @@ -1260,6 +1275,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) return len; } + +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_printk(&global_trace, ip, fmt, args); +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4d30414fe19a..fa1dccb579d5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -566,6 +566,11 @@ extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...); extern unsigned long trace_flags; -- cgit From e8165dbb03ed04d798163ee512074b9a9466a9c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Sep 2009 19:13:05 -0400 Subject: tracing: report error in trace if we fail to swap latency buffer The irqsoff tracer will fail to swap the cpu buffer with the max buffer if it preempts a commit. Instead of ignoring this, this patch makes the tracer report it if the last max latency failed due to preempting a current commit. The output of the latency tracer will look like this: # tracer: irqsoff # # irqsoff latency trace v1.1.5 on 2.6.31-rc5 # -------------------------------------------------------------------- # latency: 112 us, #1/1, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: -4281 (uid:0 nice:0 policy:0 rt_prio:0) # ----------------- # => started at: save_args # => ended at: __do_softirq # # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / # ||||| delay # cmd pid ||||| time | caller # \ / ||||| \ | / bash-4281 1d.s6 265us : update_max_tr_single: Failed to swap buffers due to commit in progress Note the latency time and the functions that disabled the irqs or preemption will still be listed. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ef08328eb28d..6df9861fde6b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -482,9 +482,20 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + */ + trace_array_printk(&max_tr, _THIS_IP_, + "Failed to swap buffers due to commit in progress\n"); + } + ftrace_enable_cpu(); - WARN_ON_ONCE(ret && ret != -EAGAIN); + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); -- cgit From 62f0b3eb5cb58931a02ee4e599e19c80a171e351 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 14:11:34 -0400 Subject: ring-buffer: check for swapped buffers in start of committing Because the irqsoff tracer can swap an internal CPU buffer, it is possible that a swap happens between the start of the write and before the committing bit is set (the committing bit will disable swapping). This patch adds a check for this and will fail the write if it detects it. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f83a42a79ee8..1766c0e8db5a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2073,7 +2073,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; @@ -2083,6 +2084,19 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, rb_start_commit(cpu_buffer); + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } + length = rb_calculate_event_length(length); again: /* @@ -2243,7 +2257,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; @@ -2476,7 +2490,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; -- cgit From 85bac32c4a52c592b857f2c360cc5ec93a097d70 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Sep 2009 14:24:40 -0400 Subject: ring-buffer: only enable ring_buffer_swap_cpu when needed Since the ability to swap the cpu buffers adds a small overhead to the recording of a trace, we only want to add it when needed. Only the irqsoff and preemptoff tracers use this feature, and both are not recommended for production kernels. This patch disables its use when neither irqsoff nor preemptoff is configured. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 163fbfc2f39f..1ea0d1234f4a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -62,6 +62,12 @@ config EVENT_TRACING config CONTEXT_SWITCH_TRACER bool +config RING_BUFFER_ALLOW_SWAP + bool + help + Allow the use of ring_buffer_swap_cpu. + Adds a very slight overhead to tracing when enabled. + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -146,6 +152,7 @@ config IRQSOFF_TRACER select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -167,6 +174,7 @@ config PREEMPT_TRACER depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP help This option measures the time spent in preemption off critical sections, with microsecond accuracy. diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1766c0e8db5a..454e74e718cf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2084,6 +2084,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, rb_start_commit(cpu_buffer); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* * Due to the ability to swap a cpu buffer from a buffer * it is possible it was swapped before we committed. @@ -2096,6 +2097,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, local_dec(&cpu_buffer->commits); return NULL; } +#endif length = rb_calculate_event_length(length); again: @@ -3498,6 +3500,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with @@ -3573,6 +3576,7 @@ out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); +#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer -- cgit