From f29ac756a40d0f1bb07d682ea521e7b666ff06d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 18:27:26 +0200 Subject: perf_counter: Optimize perf_swcounter_event() Similar to tracepoints, use an enable variable to reduce overhead when unused. Only look for a counter of a particular event type when we know there is at least one in the system. Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea4..7515c7695428 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3317,8 +3317,8 @@ out: put_cpu_var(perf_cpu_context); } -void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +void __perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { .regs = regs, @@ -3509,9 +3509,19 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) } #endif +atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_counter_destroy(struct perf_counter *counter) +{ + u64 event = counter->attr.config; + + atomic_dec(&perf_swcounter_enabled[event]); +} + static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { const struct pmu *pmu = NULL; + u64 event = counter->attr.config; /* * Software counters (currently) can't in general distinguish @@ -3520,7 +3530,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->attr.config) { + switch (event) { case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3541,6 +3551,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; pmu = &perf_ops_generic; break; } -- cgit From b84fbc9fb1d943e2c5f4efe52ed0e3c93a4bdb6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 13:57:40 +0200 Subject: perf_counter: Push inherit into perf_counter_alloc() Teach perf_counter_alloc() about inheritance so that we can optimize the inherit path in the next patch. Remove the child_counter->atrr.inherit = 1 line because the only way to get there is if parent_counter->attr.inherit == 1 and we copy the attrs. 
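The __perf_swcounter_event() rename in the first patch above only pays off together with a header-side fast path that tests the new perf_swcounter_enabled[] array before calling into the slow path; that hunk lives in include/linux/perf_counter.h and is therefore outside this kernel/-limited log. A sketch of what the inline wrapper looks like (reconstructed, not quoted from the patch):

/*
 * Sketch only -- the real wrapper sits in include/linux/perf_counter.h,
 * which the kernel/-limited log above does not show.
 */
extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
extern void __perf_swcounter_event(u32 event, u64 nr, int nmi,
				   struct pt_regs *regs, u64 addr);

static inline void
perf_swcounter_event(u32 event, u64 nr, int nmi,
		     struct pt_regs *regs, u64 addr)
{
	/*
	 * Do nothing unless at least one counter of this software
	 * event type exists somewhere in the system.
	 */
	if (atomic_read(&perf_swcounter_enabled[event]))
		__perf_swcounter_event(event, nr, nmi, regs, addr);
}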
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7515c7695428..0a45490f4029 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3568,6 +3568,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, + struct perf_counter *parent_counter, gfp_t gfpflags) { const struct pmu *pmu; @@ -3603,6 +3604,8 @@ perf_counter_alloc(struct perf_counter_attr *attr, counter->ctx = ctx; counter->oncpu = -1; + counter->parent = parent_counter; + counter->ns = get_pid_ns(current->nsproxy->pid_ns); counter->id = atomic64_inc_return(&perf_counter_id); @@ -3827,7 +3830,7 @@ SYSCALL_DEFINE5(perf_counter_open, } counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - GFP_KERNEL); + NULL, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) goto err_put_context; @@ -3893,7 +3896,8 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, - group_leader, GFP_KERNEL); + group_leader, parent_counter, + GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; get_ctx(child_ctx); @@ -3916,12 +3920,6 @@ inherit_counter(struct perf_counter *parent_counter, */ add_counter_to_ctx(child_counter, child_ctx); - child_counter->parent = parent_counter; - /* - * inherit into child's child as well: - */ - child_counter->attr.inherit = 1; - /* * Get a reference to the parent filp - we will fput it * when the child counter exits. This is safe to do because -- cgit From f344011ccb85469445369153c3d27c4ee4bc2ac8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 13:58:35 +0200 Subject: perf_counter: Optimize perf_counter_alloc()'s inherit case We don't need to add usage counts for swcounter and attr usage models for inherited counters since the parent counter will always have one, which suffices to generate the needed output. This avoids up to 3 global atomic increments per inherited counter. 
LKML-Reference: Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a45490f4029..c2b19c111718 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1508,11 +1508,13 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); + if (!counter->parent) { + atomic_dec(&nr_counters); + if (counter->attr.mmap) + atomic_dec(&nr_mmap_counters); + if (counter->attr.comm) + atomic_dec(&nr_comm_counters); + } if (counter->destroy) counter->destroy(counter); @@ -3515,6 +3517,8 @@ static void sw_perf_counter_destroy(struct perf_counter *counter) { u64 event = counter->attr.config; + WARN_ON(counter->parent); + atomic_dec(&perf_swcounter_enabled[event]); } @@ -3551,8 +3555,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; + if (!counter->parent) { + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; + } pmu = &perf_ops_generic; break; } @@ -3663,11 +3669,13 @@ done: counter->pmu = pmu; - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); + if (!counter->parent) { + atomic_inc(&nr_counters); + if (counter->attr.mmap) + atomic_inc(&nr_mmap_counters); + if (counter->attr.comm) + atomic_inc(&nr_comm_counters); + } return counter; } -- cgit From f6faac71d502be1c29c81b2f45657662c3b84470 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2009 17:24:40 -0700 Subject: rcu: Mark Hierarchical RCU no longer experimental Removes the warnings about Hierarchical RCU being experimental, given that it has gone through almost six months of being the default RCU in mainline for the x86 with very little trouble. This makes hierarchical-RCU bootup look less scary. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Cc: cl@linux-foundation.org Cc: schamp@sgi.com Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0dccfbba6d26..7717b95c2027 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1533,7 +1533,7 @@ void __init __rcu_init(void) int j; struct rcu_node *rnp; - printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); + printk(KERN_INFO "Hierarchical RCU implementation.\n"); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ @@ -1546,7 +1546,6 @@ void __init __rcu_init(void) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); /* Register notifier for non-boot CPUs */ register_cpu_notifier(&rcu_nb); - printk(KERN_WARNING "Experimental hierarchical RCU init done.\n"); } module_param(blimit, int, 0); -- cgit From 7f8b4e4e0988dadfd22330fd147ad2453e19f510 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 14:34:35 +0200 Subject: perf_counter: Add scale information to the mmap control page Add the needed time scale to the self-profile mmap information. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c2b19c111718..23614adab475 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1782,6 +1782,12 @@ void perf_counter_update_userpage(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + userpg->time_enabled = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + + userpg->time_running = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + barrier(); ++userpg->lock; preempt_enable(); -- cgit From 194002b274e9169a04beb1b23dcc132159bb566c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 16:35:24 +0200 Subject: perf_counter, x86: Add mmap counter read support Update the mmap control page with the needed information to use the userspace RDPMC instruction for self monitoring. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 23614adab475..02994a719e27 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1753,6 +1753,14 @@ int perf_counter_task_disable(void) return 0; } +static int perf_counter_index(struct perf_counter *counter) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. 
We can not serialize this because the arch @@ -1777,7 +1785,7 @@ void perf_counter_update_userpage(struct perf_counter *counter) preempt_disable(); ++userpg->lock; barrier(); - userpg->index = counter->hw.idx; + userpg->index = perf_counter_index(counter); userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); -- cgit From 38b200d67636a30cb8dc1508137908e7a649b5c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Jun 2009 20:13:11 +0200 Subject: perf_counter: Add PERF_EVENT_READ Provide a read() like event which can be used to log the counter value at specific sites such as child->parent folding on exit. In order to be useful, we log the counter parent ID, not the actual counter ID, since userspace can only relate parent IDs to perf_counter_attr constructs. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 02994a719e27..a72c20e91953 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2623,6 +2623,66 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_end(&handle); } +/* + * read event + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 value; + u64 format[3]; +}; + +static void +perf_counter_read_event(struct perf_counter *counter, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event event = { + .header = { + .type = PERF_EVENT_READ, + .misc = 0, + .size = sizeof(event) - sizeof(event.format), + }, + .pid = perf_counter_pid(counter, task), + .tid = perf_counter_tid(counter, task), + .value = atomic64_read(&counter->count), + }; + int ret, i = 0; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + event.header.size += sizeof(u64); + event.format[i++] = counter->total_time_enabled; + } + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + event.header.size += sizeof(u64); + event.format[i++] = counter->total_time_running; + } + + if (counter->attr.read_format & PERF_FORMAT_ID) { + u64 id; + + event.header.size += sizeof(u64); + if (counter->parent) + id = counter->parent->id; + else + id = counter->id; + + event.format[i++] = id; + } + + ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + if (ret) + return; + + perf_output_copy(&handle, &event, event.header.size); + perf_output_end(&handle); +} + /* * fork tracking */ @@ -3985,10 +4045,13 @@ static int inherit_group(struct perf_counter *parent_counter, } static void sync_child_counter(struct perf_counter *child_counter, - struct perf_counter *parent_counter) + struct task_struct *child) { + struct perf_counter *parent_counter = child_counter->parent; u64 child_val; + perf_counter_read_event(child_counter, child); + child_val = atomic64_read(&child_counter->count); /* @@ -4017,7 +4080,8 @@ static void sync_child_counter(struct perf_counter *child_counter, static void __perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx) + struct perf_counter_context *child_ctx, + struct task_struct *child) { struct perf_counter *parent_counter; @@ -4031,7 +4095,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter, * counters need to be zapped - but otherwise linger. 
*/ if (parent_counter) { - sync_child_counter(child_counter, parent_counter); + sync_child_counter(child_counter, child); free_counter(child_counter); } } @@ -4093,7 +4157,7 @@ void perf_counter_exit_task(struct task_struct *child) again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx, child); /* * If the last counter was a group counter, it will have appended all -- cgit From bfbd3381e63aa2a14c6706afb50ce4630aa0d9a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Jun 2009 21:11:59 +0200 Subject: perf_counter: Implement more accurate per task statistics With the introduction of PERF_EVENT_READ we have the possibility to provide accurate counter values for individual tasks in a task hierarchy. However, due to the lazy context switching used for similar counter contexts our current per task counts are way off. In order to maintain some of the lazy switch benefits we don't disable it out-right, but simply iterate the active counters and flip the values between the contexts. This only reads the counters but does not need to reprogram the full PMU. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a72c20e91953..385ca51c6e60 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -236,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->attr.inherit_stat) + ctx->nr_stat++; } /* @@ -250,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->attr.inherit_stat) + ctx->nr_stat--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -1006,6 +1010,76 @@ static int context_equiv(struct perf_counter_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } +static void __perf_counter_read(void *counter); + +static void __perf_counter_sync_stat(struct perf_counter *counter, + struct perf_counter *next_counter) +{ + u64 value; + + if (!counter->attr.inherit_stat) + return; + + /* + * Update the counter value, we cannot use perf_counter_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the counter must be on the current CPU, therefore we + * don't need to use it. + */ + switch (counter->state) { + case PERF_COUNTER_STATE_ACTIVE: + __perf_counter_read(counter); + break; + + case PERF_COUNTER_STATE_INACTIVE: + update_counter_times(counter); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the counter + * values when we flip the contexts. + */ + value = atomic64_read(&next_counter->count); + value = atomic64_xchg(&counter->count, value); + atomic64_set(&next_counter->count, value); + + /* + * XXX also sync time_enabled and time_running ? 
+ */ +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_counter_sync_stat(struct perf_counter_context *ctx, + struct perf_counter_context *next_ctx) +{ + struct perf_counter *counter, *next_counter; + + if (!ctx->nr_stat) + return; + + counter = list_first_entry(&ctx->event_list, + struct perf_counter, event_entry); + + next_counter = list_first_entry(&next_ctx->event_list, + struct perf_counter, event_entry); + + while (&counter->event_entry != &ctx->event_list && + &next_counter->event_entry != &next_ctx->event_list) { + + __perf_counter_sync_stat(counter, next_counter); + + counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(counter, event_entry); + } +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -1061,6 +1135,8 @@ void perf_counter_task_sched_out(struct task_struct *task, ctx->task = next; next_ctx->task = task; do_switch = 0; + + perf_counter_sync_stat(ctx, next_ctx); } spin_unlock(&next_ctx->lock); spin_unlock(&ctx->lock); @@ -1350,7 +1426,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __read(void *info) +static void __perf_counter_read(void *info) { struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; @@ -1372,7 +1448,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __read, counter, 1); + __perf_counter_read, counter, 1); } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); } @@ -4050,7 +4126,8 @@ static void sync_child_counter(struct perf_counter *child_counter, struct perf_counter *parent_counter = child_counter->parent; u64 child_val; - perf_counter_read_event(child_counter, child); + if (child_counter->attr.inherit_stat) + perf_counter_read_event(child_counter, child); child_val = atomic64_read(&child_counter->count); -- cgit From e6e18ec79b023d5fe84226cef533cf0e3770ce93 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2009 11:27:12 +0200 Subject: perf_counter: Rework the sample ABI The PERF_EVENT_READ implementation made me realize we don't actually need the sample_type int the output sample, since we already have that in the perf_counter_attr information. Therefore, remove the PERF_EVENT_MISC_OVERFLOW bit and the event->type overloading, and imply put counter overflow samples in a PERF_EVENT_SAMPLE type. This also fixes the issue that event->type was only 32-bit and sample_type had 64 usable bits. 
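For consumers of the output mmap buffer, the practical effect of this rework is that records are dispatched purely on header.type, and the layout of a sample is derived from the counter's attr.sample_type rather than from bits carried in the header. A rough userspace sketch; next_record(), the parse_*() helpers and the attr variable are hypothetical placeholders:

/* Hypothetical reader loop over the counter's output buffer. */
struct perf_event_header *hdr = next_record(ring);

switch (hdr->type) {
case PERF_EVENT_SAMPLE:
	/*
	 * Overflow sample: field layout follows attr.sample_type;
	 * hdr->type no longer carries the sample_type bits and
	 * PERF_EVENT_MISC_OVERFLOW is gone from hdr->misc.
	 */
	parse_sample(hdr, attr.sample_type);
	break;
case PERF_EVENT_READ:
	parse_read(hdr);
	break;
default:
	/* mmap, comm, fork, ... */
	break;
}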
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 385ca51c6e60..f2f232696587 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2575,15 +2575,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u32 cpu, reserved; } cpu_entry; - header.type = 0; + header.type = PERF_EVENT_SAMPLE; header.size = sizeof(header); - header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc = 0; header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(data->regs); - header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2592,7 +2591,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } @@ -2602,34 +2600,25 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, */ time = sched_clock(); - header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_ADDR) { - header.type |= PERF_SAMPLE_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) header.size += sizeof(u64); - } - if (sample_type & PERF_SAMPLE_ID) { - header.type |= PERF_SAMPLE_ID; + if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); - } if (sample_type & PERF_SAMPLE_CPU) { - header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (sample_type & PERF_SAMPLE_PERIOD) { - header.type |= PERF_SAMPLE_PERIOD; + if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - } if (sample_type & PERF_SAMPLE_GROUP) { - header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -2639,10 +2628,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - - header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; - } + } else + header.size += sizeof(u64); } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2693,8 +2681,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + else { + u64 nr = 0; + perf_output_put(&handle, nr); + } + } perf_output_end(&handle); } -- cgit From 19d2e755436054dfc2be640bffc32e427c37ac3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jun 2009 13:10:23 +0200 Subject: perf_counter: Complete counter swap Complete the counter swap by indeed switching the times too and updating the userpage after modifying the counter values. 
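Taken together with the earlier mmap-control-page patches (the index and offset fields plus the new time_enabled/time_running scale information), a task can now read its own counter without a system call. A sketch of the seqlock-style read; rdpmc_read() is a stand-in for the arch-specific counter read (RDPMC on x86), not a real API:

/*
 * Sketch of a self-monitoring read; pc points at the mmapped control page.
 * rdpmc_read() is a placeholder for the arch-specific instruction.
 */
static u64 read_self_counter(volatile struct perf_counter_mmap_page *pc)
{
	u64 count, enabled, running;
	u32 seq;

	do {
		seq = pc->lock;
		barrier();

		enabled = pc->time_enabled;	/* scale information */
		running = pc->time_running;

		if (pc->index) {		/* 0: not currently on a PMC */
			count  = rdpmc_read(pc->index - 1);
			count += pc->offset;
		} else {
			count = 0;		/* fall back to read() on the fd */
		}

		barrier();
	} while (pc->lock != seq);

	/*
	 * If the counter was time-multiplexed, running < enabled and the
	 * raw count can be scaled by enabled/running as an estimate.
	 */
	if (running && running < enabled)
		count = count * enabled / running;

	return count;
}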
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1246014623.31755.195.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f2f232696587..66ab1e9d1294 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1048,9 +1048,14 @@ static void __perf_counter_sync_stat(struct perf_counter *counter, value = atomic64_xchg(&counter->count, value); atomic64_set(&next_counter->count, value); + swap(counter->total_time_enabled, next_counter->total_time_enabled); + swap(counter->total_time_running, next_counter->total_time_running); + /* - * XXX also sync time_enabled and time_running ? + * Since we swizzled the values, update the user visible data too. */ + perf_counter_update_userpage(counter); + perf_counter_update_userpage(next_counter); } #define list_next_entry(pos, member) \ -- cgit From a32c7765e2796395aec49f699bd25c407155e9c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 16:55:51 +0800 Subject: tracing: Fix stack tracer sysctl handling This made my machine completely frozen: # echo 1 > /proc/sys/kernel/stack_tracer_enabled # echo 2 > /proc/sys/kernel/stack_tracer_enabled The cause is register_ftrace_function() was called twice. Also fix ftrace_enabled sysctl, though seems nothing bad happened as I tested it. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A448D17.9010305@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_stack.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3716bf04df6..bce9e01a29c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3160,10 +3160,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); - if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = ftrace_enabled; + last_ftrace_enabled = !!ftrace_enabled; if (ftrace_enabled) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2d7aebd71dbd..e644af910124 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -326,10 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || - (last_stack_tracer_enabled == stack_tracer_enabled)) + (last_stack_tracer_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = stack_tracer_enabled; + last_stack_tracer_enabled = !!stack_tracer_enabled; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit From 82d5308127c3e3404ffbf41e503853c68660b18b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 26 Jun 2009 17:07:02 +0800 Subject: trace_export: Repair missed fields Some fields for struct ftrace_graph_ret are missed when they are exported to user. 
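For reference, the structure whose return-trace fields are exported here looked roughly like this at the time; the layout is reconstructed from the TRACE_FIELD() entries in the patch that follows, so treat it as an approximation. Only func and depth were exported before; calltime, rettime and overrun were the missed fields:

/* Approximate definition, reconstructed from the TRACE_FIELD() lines below. */
struct ftrace_graph_ret {
	unsigned long		func;		/* function that returned */
	unsigned long long	calltime;
	unsigned long long	rettime;
	unsigned long		overrun;	/* depth-limit overruns */
	int			depth;
};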
Signed-off-by: Lai Jiangshan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A448FB6.5000302@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e375134d..6db005e12487 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, ftrace_graph_ret_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ret.func, func) + TRACE_FIELD(unsigned long long, ret.calltime, calltime) + TRACE_FIELD(unsigned long long, ret.rettime, rettime) + TRACE_FIELD(unsigned long, ret.overrun, overrun) TRACE_FIELD(int, ret.depth, depth) ), TP_RAW_FMT("<-- %lx (%d)") -- cgit From 238a24f626628cb16a3015f332d649f08246ca89 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Jun 2009 15:55:10 +0800 Subject: tracing/fastboot: Document the need of initcall_debug To use boot tracer, one should pass initcall_debug as well as ftrace=initcall to the command line. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A48735E.9050002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1551f47e7669..019f380fd764 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -226,13 +226,13 @@ config BOOT_TRACER the timings of the initcalls and traces key events and the identity of tasks that can cause boot delays, such as context-switches. - Its aim is to be parsed by the /scripts/bootgraph.pl tool to + Its aim is to be parsed by the scripts/bootgraph.pl tool to produce pretty graphics about boot inefficiencies, giving a visual representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - You must pass in ftrace=initcall to the kernel command line - to enable this on bootup. + You must pass in initcall_debug and ftrace=initcall to the kernel + command line to enable this on bootup. config TRACE_BRANCH_PROFILING bool -- cgit From 12de38b186c2af97bf0b4a1f907f766df46b1def Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 29 Jun 2009 17:13:55 +0100 Subject: kmemleak: Inform kmemleak about pid_hash Kmemleak does not track alloc_bootmem calls but the pid_hash allocated in pidhash_init() would need to be scanned as it contains pointers to struct pid objects. Signed-off-by: Catalin Marinas --- kernel/pid.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5d3f50..5fa1db48d8b7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -512,6 +513,12 @@ void __init pidhash_init(void) pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); if (!pid_hash) panic("Could not alloc pidhash!\n"); + /* + * pid_hash contains references to allocated struct pid objects and it + * must be scanned by kmemleak to avoid false positives. 
+ */ + kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0, + GFP_KERNEL); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } -- cgit From 57e7986ed142417498155ebcd5eaf617ac37136d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 30 Jun 2009 16:07:19 +1000 Subject: perf_counter: Provide a way to enable counters on exec This provides a way to mark a counter to be enabled on the next exec. This is useful for measuring the total activity of a program without including overhead from the process that launches it. This also changes the perf stat command to use this new facility. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <19017.43927.838745.689203@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 66ab1e9d1294..d55a50da2347 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1428,6 +1428,53 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Enable all of a task's counters that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_counter_enable_on_exec(struct task_struct *task) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_counter_ctxp; + if (!ctx || !ctx->nr_counters) + goto out; + + __perf_counter_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (!counter->attr.enable_on_exec) + continue; + counter->attr.enable_on_exec = 0; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + continue; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + enabled = 1; + } + + /* + * Unclone this context if we enabled any counter. + */ + if (enabled && ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + /* * Cross CPU call to read the hardware counter */ @@ -2949,6 +2996,9 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; + if (task->perf_counter_ctxp) + perf_counter_enable_on_exec(task); + if (!atomic_read(&nr_comm_counters)) return; -- cgit From 8bc1ad7dd301b7ca7454013519fa92e8c53655ff Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 30 Jun 2009 11:41:31 -0700 Subject: kernel/resource.c: fix sign extension in reserve_setup() When the 32-bit signed quantities get assigned to the u64 resource_size_t, they are incorrectly sign-extended. 
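The failure mode is plain C integer conversion: get_option() parses into a signed int, so a value with the top bit set ends up negative, and the implicit widening to resource_size_t sign-extends it into a nonsensical address. A minimal illustration, assuming a 64-bit resource_size_t:

/* Illustration only: why a reserve=0x80000000,... boot parameter misparsed. */
int          io_start = (int)0x80000000u;	/* stored via a signed int: negative */
unsigned int fixed    = 0x80000000u;

resource_size_t bad = io_start;		/* sign-extended:  0xffffffff80000000 */
resource_size_t ok  = fixed;		/* zero-extended:  0x0000000080000000 */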
Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13253 Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9905 Signed-off-by: Zhang Rui Reported-by: Leann Ogasawara Cc: Pierre Ossman Reported-by: Tested-by: Cc: Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ac5f3a36923f..78b087221c15 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -787,7 +787,7 @@ static int __init reserve_setup(char *str) static struct resource reserve[MAXRESERVE]; for (;;) { - int io_start, io_num; + unsigned int io_start, io_num; int x = reserved; if (get_option (&str, &io_start) != 2) -- cgit From df279ca8966c3de83105428e3391ab17690802a9 Mon Sep 17 00:00:00 2001 From: Renaud Lottiaux Date: Tue, 30 Jun 2009 11:41:34 -0700 Subject: bsdacct: fix access to invalid filp in acct_on() The file opened in acct_on and freshly stored in the ns->bacct struct can be closed in acct_file_reopen by a concurrent call after we release acct_lock and before we call mntput(file->f_path.mnt). Record file->f_path.mnt in a local variable and use this variable only. Signed-off-by: Renaud Lottiaux Signed-off-by: Louis Rilling Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 7afa31564162..9f3391090b3e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, static int acct_on(char *name) { struct file *file; + struct vfsmount *mnt; int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -256,11 +257,12 @@ static int acct_on(char *name) acct = NULL; } - mnt_pin(file->f_path.mnt); + mnt = file->f_path.mnt; + mnt_pin(mnt); acct_file_reopen(ns->bacct, file, ns); spin_unlock(&acct_lock); - mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ + mntput(mnt); /* it's pinned, now give up active reference */ kfree(acct); return 0; -- cgit From 4a2bb6fcc80e6330ca2f2393e98605052cc7780b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 30 Jun 2009 17:08:09 -0400 Subject: kprobes: No need to unlock kprobe_insn_mutex Remove needless kprobe_insn_mutex unlocking during safety check in garbage collection, because if someone releases a dirty slot during safety check (which ensures other cpus doesn't execute all dirty slots), the safety check must be fail. So, we need to hold the mutex while checking safety. 
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli LKML-Reference: <20090630210809.17851.28781.stgit@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c0fa54b276d9..16b5739c516a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -237,13 +237,9 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; - int safety; /* Ensure no-one is preepmted on the garbages */ - mutex_unlock(&kprobe_insn_mutex); - safety = check_safety(); - mutex_lock(&kprobe_insn_mutex); - if (safety != 0) + if (check_safety()) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { -- cgit From e1af3aec3e2e7d21d4b3054323779d478c19d907 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 1 Jul 2009 16:50:25 +0800 Subject: tracing: Fix trace_print_seq() We will lose something if trace_seq->buffer[0] is 0, because the copy length is calculated by strlen() in seq_puts(), so using seq_write() instead of seq_puts(). There have a example: after reboot: # echo kmemtrace > current_tracer # echo 0 > options/kmem_minimalistic # cat trace # tracer: kmemtrace # # Nothing is exported, because the first byte of trace_seq->buffer[ ] is KMEMTRACE_USER_ALLOC. ( the value of KMEMTRACE_USER_ALLOC is zero, seeing kmemtrace_print_alloc_user() in kernel/trace/kmemtrace.c) Signed-off-by: Xiao Guangrong Acked-by: Frederic Weisbecker Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Steven Rostedt LKML-Reference: <4A4B2351.5010300@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7938f3ae93e3..e0c2545622e8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; - s->buffer[len] = 0; - seq_puts(m, s->buffer); + seq_write(m, s->buffer, len); trace_seq_init(s); } -- cgit From 5bfd7560979062ad75c9805c1719cec990b5db29 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 5 Jul 2009 12:08:19 -0700 Subject: Fix virt_to_phys() warnings These warnings were observed on MIPS32 using 2.6.31-rc1 and gcc-4.2.0: mm/page_alloc.c: In function 'alloc_pages_exact': mm/page_alloc.c:1986: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast drivers/usb/mon/mon_bin.c: In function 'mon_alloc_buff': drivers/usb/mon/mon_bin.c:1264: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [akpm@linux-foundation.org: fix kernel/perf_counter.c too] Signed-off-by: Kevin Cernekee Cc: Andi Kleen Cc: Ralf Baechle Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d55a50da2347..a641eb753b8c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2020,7 +2020,7 @@ fail: static void perf_mmap_free_page(unsigned long addr) { - struct page *page = virt_to_page(addr); + struct page *page = virt_to_page((void *)addr); page->mapping = NULL; __free_page(page); -- cgit From 793285fcafce4719a05e0c99fa74b188157fe7fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 5 Jul 2009 12:08:26 -0700 Subject: cred_guard_mutex: do not return -EINTR to user-space do_execve() and ptrace_attach() return -EINTR if mutex_lock_interruptible(->cred_guard_mutex) fails. This is not right, change the code to return ERESTARTNOINTR. Perhaps we should also change proc_pid_attr_write(). Signed-off-by: Oleg Nesterov Cc: David Howells Acked-by: Roland McGrath Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61c78b2c07ba..082c320e4dbf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task) * interference; SUID, SGID and LSM creds get determined differently * under ptrace. 
*/ - retval = mutex_lock_interruptible(&task->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(&task->cred_guard_mutex)) goto out; task_lock(task); -- cgit From b43f3cbd21ffbd719fd4fa6642bfe6af255ded34 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Jul 2009 01:54:37 +0400 Subject: headers: mnt_namespace.h redux Fix various silly problems wrt mnt_namespace.h: - exit_mnt_ns() isn't used, remove it - done that, sched.h and nsproxy.h inclusions aren't needed - mount.h inclusion was need for vfsmount_lock, but no longer - remove mnt_namespace.h inclusion from files which don't use anything from mnt_namespace.h Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/kmod.c | 1 - 3 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 628d41f0dd54..869dc221733e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0aa..bd2959228871 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 7e95bedb2bfc..385c31a1bdbf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include -- cgit From ad361c9884e809340f6daca80d56a9e9c871690a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 6 Jul 2009 13:05:40 -0700 Subject: Remove multiple KERN_ prefixes from printk formats Commit 5fd29d6ccbc98884569d6f3105aeca70858b3e0f ("printk: clean up handling of log-levels and newlines") changed printk semantics. printk lines with multiple KERN_ prefixes are no longer emitted as before the patch. is now included in the output on each additional use. Remove all uses of multiple KERN_s in formats. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2b..0a049837008e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2451,9 +2451,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return ret; } if (ret > 0) { - printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " - "it should follow 0/-E convention\n" - KERN_WARNING "%s: loading module anyway...\n", + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", __func__, mod->name, ret, __func__); dump_stack(); -- cgit From 264ef8a904943ed7d0b04fa958894d7a5c2b2c61 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2009 10:33:01 +0100 Subject: kmemleak: Remove alloc_bootmem annotations introduced in the past kmemleak_alloc() calls were added in some places where alloc_bootmem was called. Since now kmemleak tracks bootmem allocations, these explicit calls should be run. 
Signed-off-by: Catalin Marinas Cc: Ingo Molnar Acked-by: Pekka Enberg --- kernel/pid.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 5fa1db48d8b7..31310b5d3f50 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,7 +36,6 @@ #include #include #include -#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -513,12 +512,6 @@ void __init pidhash_init(void) pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); if (!pid_hash) panic("Could not alloc pidhash!\n"); - /* - * pid_hash contains references to allocated struct pid objects and it - * must be scanned by kmemleak to avoid false positives. - */ - kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0, - GFP_KERNEL); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } -- cgit From d86ee4809d0329d4aa0d0f2c76c2295a16862799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:57 +0200 Subject: sched: optimize cond_resched() Optimize cond_resched() by removing one conditional. Currently cond_resched() checks system_state == SYSTEM_RUNNING in order to avoid scheduling before the scheduler is running. We can however, as per suggestion of Matt, use PREEMPT_ACTIVE to accomplish that very same. Suggested-by: Matt Mackall Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- kernel/sched.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..01f55ada3598 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6541,6 +6541,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6560,8 +6565,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { + if (should_resched()) { __cond_resched(); return 1; } @@ -6579,12 +6583,12 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { - int resched = need_resched() && system_state == SYSTEM_RUNNING; + int resched = should_resched(); int ret = 0; if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched && need_resched()) + if (resched) __cond_resched(); else cpu_relax(); @@ -6599,7 +6603,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && system_state == SYSTEM_RUNNING) { + if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); -- cgit From 405f55712dfe464b3240d7816cc4fe4174831be2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 11 Jul 2009 22:08:37 +0400 Subject: headers: smp_lock.h redux * Remove smp_lock.h from files which don't need it (including some headers!) 
* Add smp_lock.h to files which do need it * Make smp_lock.h include conditional in hardirq.h It's needed only for one kernel_locked() usage which is under CONFIG_PREEMPT This will make hardirq.h inclusion cheaper for every PREEMPT=n config (which includes allmodconfig/allyesconfig, BTW) Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- kernel/power/user.c | 1 - kernel/trace/blktrace.c | 1 + kernel/trace/trace.c | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index ed97375daae9..bf0014d6a5f0 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 39af8af6fc30..1090b0aed9ba 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa8..8bc8d8afea6a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include -- cgit
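A usage note on the multiple-KERN_ cleanup above: since the log-level rework, a KERN_* marker in the middle of a format string is emitted literally in the log, so multi-line messages should either continue the string under a single prefix (as the module.c hunk does) or be split into separate printk() calls. A small sketch with a hypothetical "foo" prefix and return value ret:

/* One prefix, string continuation -- mirrors the module.c fix above: */
printk(KERN_WARNING "foo: init returned %d, it should follow 0/-E convention\n"
		    "foo: loading module anyway...\n", ret);

/* ...or two independent calls, each with its own level: */
printk(KERN_WARNING "foo: init returned %d, it should follow 0/-E convention\n", ret);
printk(KERN_WARNING "foo: loading module anyway...\n");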