From 505d9147a72d4e14323af9581dde066bd5fc439c Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Thu, 21 Apr 2011 15:37:20 -0700
Subject: sparc32: fix section mismatch warnings in apc, pmc and time_32

In all cases there was a struct of_device_id variable defined as __initdata.
But it was referenced from struct platform_driver.of_match_table
which is not guaranteed to be used during init only.

So drop the __initdata annotation.

This fixes the following warnings:

WARNING: arch/sparc/kernel/built-in.o(.data+0x810): Section mismatch in reference from the variable clock_driver to the variable .init.data:clock_match
The variable clock_driver references
the variable __initdata clock_match
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console

WARNING: arch/sparc/kernel/built-in.o(.data+0xcec): Section mismatch in reference from the variable apc_driver to the variable .init.data:apc_match
The variable apc_driver references
the variable __initdata apc_match
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console

WARNING: arch/sparc/kernel/built-in.o(.data+0xd60): Section mismatch in reference from the variable pmc_driver to the variable .init.data:pmc_match
The variable pmc_driver references
the variable __initdata pmc_match
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/apc.c     | 2 +-
 arch/sparc/kernel/pmc.c     | 2 +-
 arch/sparc/kernel/time_32.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c
index f679c57644d5..1e34f29e58bb 100644
--- a/arch/sparc/kernel/apc.c
+++ b/arch/sparc/kernel/apc.c
@@ -165,7 +165,7 @@ static int __devinit apc_probe(struct platform_device *op)
 	return 0;
 }
 
-static struct of_device_id __initdata apc_match[] = {
+static struct of_device_id apc_match[] = {
 	{
 		.name = APC_OBPNAME,
 	},
diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c
index 93d7b4465f8d..6a585d393580 100644
--- a/arch/sparc/kernel/pmc.c
+++ b/arch/sparc/kernel/pmc.c
@@ -69,7 +69,7 @@ static int __devinit pmc_probe(struct platform_device *op)
 	return 0;
 }
 
-static struct of_device_id __initdata pmc_match[] = {
+static struct of_device_id pmc_match[] = {
 	{
 		.name = PMC_OBPNAME,
 	},
diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c
index 4e236391b635..96046a4024c2 100644
--- a/arch/sparc/kernel/time_32.c
+++ b/arch/sparc/kernel/time_32.c
@@ -168,7 +168,7 @@ static int __devinit clock_probe(struct platform_device *op)
 	return 0;
 }
 
-static struct of_device_id __initdata clock_match[] = {
+static struct of_device_id clock_match[] = {
 	{
 		.name = "eeprom",
 	},
-- 
cgit 


From f486b3dc2d048e7309a733f97eb9f9f83d586df2 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Thu, 21 Apr 2011 16:35:46 -0700
Subject: sparc32: fix sparcstation 5 boot

The sparcstation 5 I have available has no MID property for the CPU.
This resulted in a panic when booting a SMP kernel on this box.

The assigned field in cpu_data is never used, so if we fail
to read the MID property then inform user and continue booting.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/smp_32.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 91c10fb70858..850a1360c0d6 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -53,6 +53,7 @@ cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 void __cpuinit smp_store_cpu_info(int id)
 {
 	int cpu_node;
+	int mid;
 
 	cpu_data(id).udelay_val = loops_per_jiffy;
 
@@ -60,10 +61,13 @@ void __cpuinit smp_store_cpu_info(int id)
 	cpu_data(id).clock_tick = prom_getintdefault(cpu_node,
 						     "clock-frequency", 0);
 	cpu_data(id).prom_node = cpu_node;
-	cpu_data(id).mid = cpu_get_hwmid(cpu_node);
+	mid = cpu_get_hwmid(cpu_node);
 
-	if (cpu_data(id).mid < 0)
-		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
+	if (mid < 0) {
+		printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08d", id, cpu_node);
+		mid = 0;
+	}
+	cpu_data(id).mid = mid;
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
-- 
cgit 


From 1d44e8288a0557c28c447d7e511f50d06ff93a34 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 9 May 2011 11:35:19 -0500
Subject: x86, UV: Fix NMI handler for UV platforms

This fixes problems seen on UV systems handling NMIs from the
node controller.

I isolated the "dazed..." messages that I saw earlier to a bug in
the BMC on our platform. It was sending NMIs w/o properly setting
a register that indicated the source of NMI.

So rather than _assuming_ any unhandled NMI came from the UV system
maintenance console (SMC), add a check to verify that the SMC actually
sent the NMI.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: gorcunov@gmail.com
Cc: dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_hub.h   |  2 ++
 arch/x86/include/asm/uv/uv_mmrs.h  | 16 ++++++++++++-
 arch/x86/kernel/apic/x2apic_uv_x.c | 48 ++++++++++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index a501741c2335..4298002d0c83 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -398,6 +398,8 @@ struct uv_blade_info {
 	unsigned short	nr_online_cpus;
 	unsigned short	pnode;
 	short		memory_nid;
+	spinlock_t	nmi_lock;
+	unsigned long	nmi_count;
 };
 extern struct uv_blade_info *uv_blade_info;
 extern short *uv_node_to_blade;
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 20cafeac7455..f5bb64a823d7 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
  *
  * SGI UV MMR definitions
  *
- * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_X86_UV_UV_MMRS_H
@@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                               UVH_SCRATCH5                                */
+/* ========================================================================= */
+#define UVH_SCRATCH5 0x2d0200UL
+#define UVH_SCRATCH5_32 0x00778
+
+#define UVH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+union uvh_scratch5_u {
+    unsigned long	v;
+    struct uvh_scratch5_s {
+	unsigned long	scratch5 : 64;  /* RW, W1CS */
+    } s;
+};
 
 #endif /* __ASM_UV_MMRS_X86_H__ */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 33b10a0fc095..7acd2d2ac965 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -37,6 +37,13 @@
 #include <asm/smp.h>
 #include <asm/x86_init.h>
 #include <asm/emergency-restart.h>
+#include <asm/nmi.h>
+
+/* BMC sets a bit this MMR non-zero before sending an NMI */
+#define UVH_NMI_MMR				UVH_SCRATCH5
+#define UVH_NMI_MMR_CLEAR			(UVH_NMI_MMR + 8)
+#define UV_NMI_PENDING_MASK			(1UL << 63)
+DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
@@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void)
  */
 int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
+	unsigned long real_uv_nmi;
+	int bid;
+
 	if (reason != DIE_NMIUNKNOWN)
 		return NOTIFY_OK;
 
 	if (in_crash_kexec)
 		/* do nothing if entering the crash kernel */
 		return NOTIFY_OK;
+
 	/*
-	 * Use a lock so only one cpu prints at a time
-	 * to prevent intermixed output.
+	 * Each blade has an MMR that indicates when an NMI has been sent
+	 * to cpus on the blade. If an NMI is detected, atomically
+	 * clear the MMR and update a per-blade NMI count used to
+	 * cause each cpu on the blade to notice a new NMI.
+	 */
+	bid = uv_numa_blade_id();
+	real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+
+	if (unlikely(real_uv_nmi)) {
+		spin_lock(&uv_blade_info[bid].nmi_lock);
+		real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+		if (real_uv_nmi) {
+			uv_blade_info[bid].nmi_count++;
+			uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
+		}
+		spin_unlock(&uv_blade_info[bid].nmi_lock);
+	}
+
+	if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
+		return NOTIFY_DONE;
+
+	__get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
+
+	/*
+	 * Use a lock so only one cpu prints at a time.
+	 * This prevents intermixed output.
 	 */
 	spin_lock(&uv_nmi_lock);
-	pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
+	pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
 	dump_stack();
 	spin_unlock(&uv_nmi_lock);
 
@@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 }
 
 static struct notifier_block uv_dump_stack_nmi_nb = {
-	.notifier_call	= uv_handle_nmi
+	.notifier_call	= uv_handle_nmi,
+	.priority = NMI_LOCAL_LOW_PRIOR - 1,
 };
 
 void uv_register_nmi_notifier(void)
@@ -720,8 +756,9 @@ void __init uv_system_init(void)
 	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
-	uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+	uv_blade_info = kzalloc(bytes, GFP_KERNEL);
 	BUG_ON(!uv_blade_info);
+
 	for (blade = 0; blade < uv_num_possible_blades(); blade++)
 		uv_blade_info[blade].memory_nid = -1;
 
@@ -747,6 +784,7 @@ void __init uv_system_init(void)
 			uv_blade_info[blade].pnode = pnode;
 			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
+			spin_lock_init(&uv_blade_info[blade].nmi_lock);
 			max_pnode = max(pnode, max_pnode);
 			blade++;
 		}
-- 
cgit 


From 9bbeacf52f66d165739a4bbe9c018d17493a74b5 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 11 May 2011 13:06:13 +0200
Subject: kprobes, x86: Disable irqs during optimized callback

Disable irqs during optimized callback, so we don't miss any in-irq kprobes.

The following commands:

 # cd /debug/tracing/
 # echo "p mutex_unlock" >> kprobe_events
 # echo "p _raw_spin_lock" >> kprobe_events
 # echo "p smp_apic_timer_interrupt" >> ./kprobe_events
 # echo 1 > events/enable

Cause the optimized kprobes to be missed. None is missed
with the fix applied.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Link: http://lkml.kernel.org/r/20110511110613.GB2390@jolsa.brq.redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c969fd9d1566..f1a6244d7d93 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
 					 struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	unsigned long flags;
 
 	/* This is possible if op is under delayed unoptimizing */
 	if (kprobe_disabled(&op->kp))
 		return;
 
-	preempt_disable();
+	local_irq_save(flags);
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(&op->kp);
 	} else {
@@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
 		opt_pre_handler(&op->kp, regs);
 		__this_cpu_write(current_kprobe, NULL);
 	}
-	preempt_enable_no_resched();
+	local_irq_restore(flags);
 }
 
 static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
-- 
cgit 


From b1054282d752c5a026e2c0450616ebf37fc0413e Mon Sep 17 00:00:00 2001
From: Tkhai Kirill <tkhai@yandex.ru>
Date: Tue, 10 May 2011 02:31:41 +0000
Subject: sparc32: Fixed unaligned memory copying in function
 __csum_partial_copy_sparc_generic

When we are in the label cc_dword_align, registers %o0 and %o1 have the same last 2 bits,
but it's not guaranteed one of them is zero. So we can get unaligned memory access
in label ccte. Example of parameters which lead to this:
%o0=0x7ff183e9, %o1=0x8e709e7d, %g1=3

With these parameters I got memory corruption: 5 additional bytes were overwritten.
This patch corrects the error.

One comment on the patch: we don't care about the third bit in %o1, because
cc_end_cruft stores a word or less.

Signed-off-by: Tkhai Kirill <tkhai@yandex.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/checksum_32.S | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S
index 3632cb34e914..0084c3361e15 100644
--- a/arch/sparc/lib/checksum_32.S
+++ b/arch/sparc/lib/checksum_32.S
@@ -289,10 +289,16 @@ cc_end_cruft:
 
 	/* Also, handle the alignment code out of band. */
 cc_dword_align:
-	cmp	%g1, 6
-	bl,a	ccte
+	cmp	%g1, 16
+	bge	1f
+	 srl	%g1, 1, %o3
+2:	cmp	%o3, 0
+	be,a	ccte
 	 andcc	%g1, 0xf, %o3
-	andcc	%o0, 0x1, %g0
+	andcc	%o3, %o0, %g0	! Check %o0 only (%o1 has the same last 2 bits)
+	be,a	2b
+	 srl	%o3, 1, %o3
+1:	andcc	%o0, 0x1, %g0
 	bne	ccslow
 	 andcc	%o0, 0x2, %g0
 	be	1f
-- 
cgit 


From 92bdaef7b2c5d3cb8abc902faa1f7670a183dcdc Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 5 May 2011 13:50:43 -0400
Subject: Revert "xen/mmu: Add workaround "x86-64, mm: Put early page table
 high""

This reverts commit a38647837a411f7df79623128421eef2118b5884.

It does not work with certain AMD machines.

last_pfn = 0x100000 max_arch_pfn = 0x400000000
initial memory mapped : 0 - 02c3a000
Base memory trampoline at [ffff88000009b000] 9b000 size 20480
init_memory_mapping: 0000000000000000-0000000100000000
 0000000000 - 0100000000 page 4k
kernel direct mapping tables up to 100000000 @ ff7fb000-100000000
init_memory_mapping: 0000000100000000-00000001e0800000
 0100000000 - 01e0800000 page 4k
kernel direct mapping tables up to 1e0800000 @ 1df0f3000-1e0000000
xen: setting RW the range fffdc000 - 100000000
RAMDISK: 0203b000 - 02c3a000
No NUMA configuration found
Faking a node at 0000000000000000-00000001e0800000
NUMA: Using 63 for the hash shift.
Initmem setup node 0 0000000000000000-00000001e0800000
  NODE_DATA [00000001dfffb000 - 00000001dfffffff]
BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffff81cf6a75>] setup_node_bootmem+0x18a/0x1ea
PGD 0
Oops: 0003 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Not tainted 2.6.39-0-virtual #6~smb1
RIP: e030:[<ffffffff81cf6a75>]  [<ffffffff81cf6a75>] setup_node_bootmem+0x18a/0x1ea
RSP: e02b:ffffffff81c01e38  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 00000001e0800000 RCX: 0000000000001040
RDX: 0000000000004100 RSI: 0000000000000000 RDI: ffff8801dfffb000
RBP: ffffffff81c01e58 R08: 0000000000000020 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000bfe400
FS:  0000000000000000(0000) GS:ffffffff81cca000(0000) knlGS:0000000000000000
CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000001c03000 CR4: 0000000000000660
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81c00000, task ffffffff81c0b020)
Stack:
 0000000000000040 0000000000000001 0000000000000000 ffffffffffffffff
 ffffffff81c01e88 ffffffff81cf6c25 0000000000000000 0000000000000000
 ffffffff81cf687f 0000000000000000 ffffffff81c01ea8 ffffffff81cf6e45
Call Trace:
 [<ffffffff81cf6c25>] numa_register_memblks.constprop.3+0x150/0x181
 [<ffffffff81cf687f>] ? numa_add_memblk+0x7c/0x7c
 [<ffffffff81cf6e45>] numa_init.part.2+0x1c/0x7c
 [<ffffffff81cf687f>] ? numa_add_memblk+0x7c/0x7c
 [<ffffffff81cf6f67>] numa_init+0x6c/0x70
 [<ffffffff81cf7057>] initmem_init+0x39/0x3b
 [<ffffffff81ce5865>] setup_arch+0x64e/0x769
 [<ffffffff815e43c1>] ? printk+0x51/0x53
 [<ffffffff81cdf92b>] start_kernel+0xd4/0x3f3
 [<ffffffff81cdf388>] x86_64_start_reservations+0x132/0x136
 [<ffffffff81ce2ed4>] xen_start_kernel+0x588/0x58f
Code: 41 00 00 48 8b 3c c5 a0 24 cc 81 31 c0 40 f6 c7 01 74 05 aa 66 ba ff 40 40 f6 c7 02 74 05 66 ab 83 ea 02 89 d1 c1 e9 02 f6 c2 02 <f3> ab 74 02 66 ab 80 e2 01 74 01 aa 49 63 c4 48 c1 eb 0c 44 89
RIP  [<ffffffff81cf6a75>] setup_node_bootmem+0x18a/0x1ea
 RSP <ffffffff81c01e38>
CR2: 0000000000000000
---[ end trace a7919e7f17c0a725 ]---
Kernel panic - not syncing: Attempted to kill the idle task!
Pid: 0, comm: swapper Tainted: G      D     2.6.39-0-virtual #6~smb1

Reported-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 123 -----------------------------------------------------
 1 file changed, 123 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 55c965b38c27..cf4ef61e425b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1463,119 +1463,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
-#ifdef CONFIG_X86_64
-static __initdata u64 __last_pgt_set_rw = 0;
-static __initdata u64 __pgt_buf_start = 0;
-static __initdata u64 __pgt_buf_end = 0;
-static __initdata u64 __pgt_buf_top = 0;
-/*
- * As a consequence of the commit:
- * 
- * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
- * Author: Yinghai Lu <yinghai@kernel.org>
- * Date:   Fri Dec 17 16:58:28 2010 -0800
- * 
- *     x86-64, mm: Put early page table high
- * 
- * at some point init_memory_mapping is going to reach the pagetable pages
- * area and map those pages too (mapping them as normal memory that falls
- * in the range of addresses passed to init_memory_mapping as argument).
- * Some of those pages are already pagetable pages (they are in the range
- * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
- * everything is fine.
- * Some of these pages are not pagetable pages yet (they fall in the range
- * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
- * are going to be mapped RW.  When these pages become pagetable pages and
- * are hooked into the pagetable, xen will find that the guest has already
- * a RW mapping of them somewhere and fail the operation.
- * The reason Xen requires pagetables to be RO is that the hypervisor needs
- * to verify that the pagetables are valid before using them. The validation
- * operations are called "pinning".
- * 
- * In order to fix the issue we mark all the pages in the entire range
- * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
- * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
- * init_memory_mapping. Hence the kernel is going to crash as soon as one
- * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
- * ranges are RO).
- * 
- * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
- * the init_memory_mapping has completed (in a perfect world we would
- * call this function from init_memory_mapping, but lets ignore that).
- * 
- * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
- * end,top] have all changed to new values (b/c init_memory_mapping
- * is called and setting up another new page-table). Hence, the first time
- * we enter this function, we save away the pgt_buf_start value and update
- * the pgt_buf_[end,top].
- * 
- * When we detect that the "old" pgt_buf_start through pgt_buf_end
- * PFNs have been reserved (so memblock_x86_reserve_range has been called),
- * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
- * 
- * And then we update those "old" pgt_buf_[end|top] with the new ones
- * so that we can redo this on the next pagetable.
- */
-static __init void mark_rw_past_pgt(void) {
-
-	if (pgt_buf_end > pgt_buf_start) {
-		u64 addr, size;
-
-		/* Save it away. */
-		if (!__pgt_buf_start) {
-			__pgt_buf_start = pgt_buf_start;
-			__pgt_buf_end = pgt_buf_end;
-			__pgt_buf_top = pgt_buf_top;
-			return;
-		}
-		/* If we get the range that starts at __pgt_buf_end that means
-		 * the range is reserved, and that in 'init_memory_mapping'
-		 * the 'memblock_x86_reserve_range' has been called with the
-		 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
-		 * pgt_buf_[start|end|top] refer now to a new pagetable.
-		 * Note: we are called _after_ the pgt_buf_[..] have been
-		 * updated.*/
-
-		addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
-						       &size, PAGE_SIZE);
-
-		/* Still not reserved, meaning 'memblock_x86_reserve_range'
-		 * hasn't been called yet. Update the _end and _top.*/
-		if (addr == PFN_PHYS(__pgt_buf_start)) {
-			__pgt_buf_end = pgt_buf_end;
-			__pgt_buf_top = pgt_buf_top;
-			return;
-		}
-
-		/* OK, the area is reserved, meaning it is time for us to
-		 * set RW for the old end->top PFNs. */
-
-		/* ..unless we had already done this. */
-		if (__pgt_buf_end == __last_pgt_set_rw)
-			return;
-
-		addr = PFN_PHYS(__pgt_buf_end);
-		
-		/* set as RW the rest */
-		printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
-			PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
-		
-		while (addr < PFN_PHYS(__pgt_buf_top)) {
-			make_lowmem_page_readwrite(__va(addr));
-			addr += PAGE_SIZE;
-		}
-		/* And update everything so that we are ready for the next
-		 * pagetable (the one created for regions past 4GB) */
-		__last_pgt_set_rw = __pgt_buf_end;
-		__pgt_buf_start = pgt_buf_start;
-		__pgt_buf_end = pgt_buf_end;
-		__pgt_buf_top = pgt_buf_top;
-	}
-	return;
-}
-#else
-static __init void mark_rw_past_pgt(void) { }
-#endif
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
@@ -1601,14 +1488,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
 	unsigned long pfn = pte_pfn(pte);
 
-	/*
-	 * A bit of optimization. We do not need to call the workaround
-	 * when xen_set_pte_init is called with a PTE with 0 as PFN.
-	 * That is b/c the pagetable at that point are just being populated
-	 * with empty values and we can save some cycles by not calling
-	 * the 'memblock' code.*/
-	if (pfn)
-		mark_rw_past_pgt();
 	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
@@ -2118,8 +1997,6 @@ __init void xen_ident_map_ISA(void)
 
 static __init void xen_post_allocator_init(void)
 {
-	mark_rw_past_pgt();
-
 #ifdef CONFIG_XEN_DEBUG
 	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
 #endif
-- 
cgit 


From 279b706bf800b5967037f492dbe4fc5081ad5d0f Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 14 Apr 2011 15:49:41 +0100
Subject: x86,xen: introduce x86_init.mapping.pagetable_reserve

Introduce a new x86_init hook called pagetable_reserve that at the end
of init_memory_mapping is used to reserve a range of memory addresses for
the kernel pagetable pages we used and free the other ones.

On native it just calls memblock_x86_reserve_range while on xen it also
takes care of setting the spare memory previously allocated
for kernel pagetable pages from RO to RW, so that it can be used for
other purposes.

A detailed explanation of the reason why this hook is needed follows.

As a consequence of the commit:

commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
Author: Yinghai Lu <yinghai@kernel.org>
Date:   Fri Dec 17 16:58:28 2010 -0800

    x86-64, mm: Put early page table high

at some point init_memory_mapping is going to reach the pagetable pages
area and map those pages too (mapping them as normal memory that falls
in the range of addresses passed to init_memory_mapping as argument).
Some of those pages are already pagetable pages (they are in the range
pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
everything is fine.
Some of these pages are not pagetable pages yet (they fall in the range
pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
are going to be mapped RW.  When these pages become pagetable pages and
are hooked into the pagetable, xen will find that the guest has already
a RW mapping of them somewhere and fail the operation.
The reason Xen requires pagetables to be RO is that the hypervisor needs
to verify that the pagetables are valid before using them. The validation
operations are called "pinning" (more details in arch/x86/xen/mmu.c).

In order to fix the issue we mark all the pages in the entire range
pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
is completed only the range pgt_buf_start-pgt_buf_end is reserved by
init_memory_mapping. Hence the kernel is going to crash as soon as one
of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
ranges are RO).

For this reason we need a hook to reserve the kernel pagetable pages we
used and free the other ones so that they can be reused for other
purposes.
On native it just means calling memblock_x86_reserve_range, on Xen it
also means marking RW the pagetable pages that we allocated before but
that haven't been used before.

Another way to fix this without using the hook is by adding an 'if
(xen_pv_domain)' in the 'init_memory_mapping' code and calling the Xen
counterpart, but that is just nasty.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/include/asm/pgtable_types.h |  1 +
 arch/x86/include/asm/x86_init.h      | 12 ++++++++++++
 arch/x86/kernel/x86_init.c           |  4 ++++
 arch/x86/mm/init.c                   | 24 ++++++++++++++++++++++--
 arch/x86/xen/mmu.c                   | 15 +++++++++++++++
 5 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723d1f32..d56187c6b838 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
+extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
 extern void native_pagetable_setup_start(pgd_t *base);
 extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2e2ad8..d3d859035af9 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -67,6 +67,17 @@ struct x86_init_oem {
 	void (*banner)(void);
 };
 
+/**
+ * struct x86_init_mapping - platform specific initial kernel pagetable setup
+ * @pagetable_reserve:	reserve a range of addresses for kernel pagetable usage
+ *
+ * For more details on the purpose of this hook, look in
+ * init_memory_mapping and the commit that added it.
+ */
+struct x86_init_mapping {
+	void (*pagetable_reserve)(u64 start, u64 end);
+};
+
 /**
  * struct x86_init_paging - platform specific paging functions
  * @pagetable_setup_start:	platform specific pre paging_init() call
@@ -123,6 +134,7 @@ struct x86_init_ops {
 	struct x86_init_mpparse		mpparse;
 	struct x86_init_irqs		irqs;
 	struct x86_init_oem		oem;
+	struct x86_init_mapping		mapping;
 	struct x86_init_paging		paging;
 	struct x86_init_timers		timers;
 	struct x86_init_iommu		iommu;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128b..75ef4b18e9b7 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
 		.banner			= default_banner,
 	},
 
+	.mapping = {
+		.pagetable_reserve		= native_pagetable_reserve,
+	},
+
 	.paging = {
 		.pagetable_setup_start	= native_pagetable_setup_start,
 		.pagetable_setup_done	= native_pagetable_setup_done,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289b039b..722a4c372ce3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
 }
 
+void native_pagetable_reserve(u64 start, u64 end)
+{
+	memblock_x86_reserve_range(start, end, "PGTABLE");
+}
+
 struct map_range {
 	unsigned long start;
 	unsigned long end;
@@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	__flush_tlb_all();
 
+	/*
+	 * Reserve the kernel pagetable pages we used (pgt_buf_start -
+	 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
+	 * so that they can be reused for other purposes.
+	 *
+	 * On native it just means calling memblock_x86_reserve_range, on Xen it
+	 * also means marking RW the pagetable pages that we allocated before
+	 * but that haven't been used.
+	 *
+	 * In fact on xen we mark RO the whole range pgt_buf_start -
+	 * pgt_buf_top, because we have to make sure that when
+	 * init_memory_mapping reaches the pagetable pages area, it maps
+	 * RO all the pagetable pages, including the ones that are beyond
+	 * pgt_buf_end at that time.
+	 */
 	if (!after_bootmem && pgt_buf_end > pgt_buf_start)
-		memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
-				 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
+		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+				PFN_PHYS(pgt_buf_end));
 
 	if (!after_bootmem)
 		early_memtest(start, end);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index cf4ef61e425b..0684f3c74d53 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 {
 }
 
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+	/* reserve the range used */
+	native_pagetable_reserve(start, end);
+
+	/* set as RW the rest */
+	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+			PFN_PHYS(pgt_buf_top));
+	while (end < PFN_PHYS(pgt_buf_top)) {
+		make_lowmem_page_readwrite(__va(end));
+		end += PAGE_SIZE;
+	}
+}
+
 static void xen_post_allocator_init(void);
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -2105,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 void __init xen_init_mmu_ops(void)
 {
+	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
 	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
 	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
 	pv_mmu_ops = xen_mmu_ops;
-- 
cgit 


From 53f8023febf9b3e18d8fb0d99c55010e473ce53d Mon Sep 17 00:00:00 2001
From: Sedat Dilek <sedat.dilek@gmail.com>
Date: Sun, 17 Apr 2011 16:17:34 +0200
Subject: x86/mm: Fix section mismatch derived from native_pagetable_reserve()

With CONFIG_DEBUG_SECTION_MISMATCH=y I see these warnings in next-20110415:

  LD      vmlinux.o
  MODPOST vmlinux.o
WARNING: vmlinux.o(.text+0x1ba48): Section mismatch in reference from the function native_pagetable_reserve() to the function .init.text:memblock_x86_reserve_range()
The function native_pagetable_reserve() references
the function __init memblock_x86_reserve_range().
This is often because native_pagetable_reserve lacks a __init
annotation or the annotation of memblock_x86_reserve_range is wrong.

This patch fixes the issue.
Thanks to pipacs from PaX project for help on IRC.

Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 722a4c372ce3..37b8b0fe8320 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,7 +81,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
 }
 
-void native_pagetable_reserve(u64 start, u64 end)
+void __init native_pagetable_reserve(u64 start, u64 end)
 {
 	memblock_x86_reserve_range(start, end, "PGTABLE");
 }
-- 
cgit 


From 77ed23f8d995a01cd8101d84351b567bf5177a30 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Tue, 10 May 2011 08:26:43 -0500
Subject: x86: Fix UV BAU for non-consecutive nasids

This is a fix for the SGI Altix-UV Broadcast Assist Unit code,
which is used for TLB flushing.

Certain hardware configurations (that customers are ordering)
cause nasids (numa address space id's) to be non-consecutive.
Specifically, once you have more than 4 blades in an IRU
(Individual Rack Unit - or 1/2 rack) but less than the maximum
of 16, the nasid numbering becomes non-consecutive.  This
currently results in a 'catastrophic error' (CATERR) detected by
the firmware during OS boot.  The BAU is generating an 'INTD'
request that is targeting a non-existent nasid value. Such
configurations may also occur when a blade is configured off
because of hardware errors. (There is one UV hub per blade.)

This patch is required to support such configurations.

The problem with the tlb_uv.c code is that it is using the
consecutive hub numbers as indices to the BAU distribution bit
map. These are simply the ordinal position of the hub or blade
within its partition.  It should be using physical node numbers
(pnodes), which correspond to the physical nasid values. Use of
the hub number only works as long as the nasids in the partition
are consecutive and increase with a stride of 1.

This patch changes the index to be the pnode number, thus
allowing nasids to be non-consecutive.
It also provides a table in local memory for each cpu to
translate target cpu number to target pnode and nasid.
And it improves naming to properly reflect 'node' and 'uvhub'
versus 'nasid'.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/E1QJmxX-0002Mz-Fk@eag09.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h | 17 ++++++--
 arch/x86/platform/uv/tlb_uv.c    | 92 +++++++++++++++++++++++++++-------------
 2 files changed, 76 insertions(+), 33 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 3e094af443c3..130f1eeee5fe 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -94,6 +94,8 @@
 /* after this # consecutive successes, bump up the throttle if it was lowered */
 #define COMPLETE_THRESHOLD 5
 
+#define UV_LB_SUBNODEID 0x10
+
 /*
  * number of entries in the destination side payload queue
  */
@@ -124,7 +126,7 @@
  * The distribution specification (32 bytes) is interpreted as a 256-bit
  * distribution vector. Adjacent bits correspond to consecutive even numbered
  * nodeIDs. The result of adding the index of a given bit to the 15-bit
- * 'base_dest_nodeid' field of the header corresponds to the
+ * 'base_dest_nasid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
 struct bau_target_uvhubmask {
@@ -176,7 +178,7 @@ struct bau_msg_payload {
 struct bau_msg_header {
 	unsigned int dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
-	unsigned int base_dest_nodeid:15; /* nasid of the */
+	unsigned int base_dest_nasid:15; /* nasid of the */
 	/* bits 20:6 */			  /* first bit in uvhub map */
 	unsigned int command:8;	/* message type */
 	/* bits 28:21 */
@@ -378,6 +380,10 @@ struct ptc_stats {
 	unsigned long d_rcanceled; /* number of messages canceled by resets */
 };
 
+struct hub_and_pnode {
+	short uvhub;
+	short pnode;
+};
 /*
  * one per-cpu; to locate the software tables
  */
@@ -399,10 +405,12 @@ struct bau_control {
 	int baudisabled;
 	int set_bau_off;
 	short cpu;
+	short osnode;
 	short uvhub_cpu;
 	short uvhub;
 	short cpus_in_socket;
 	short cpus_in_uvhub;
+	short partition_base_pnode;
 	unsigned short message_number;
 	unsigned short uvhub_quiesce;
 	short socket_acknowledge_count[DEST_Q_SIZE];
@@ -422,15 +430,16 @@ struct bau_control {
 	int congested_period;
 	cycles_t period_time;
 	long period_requests;
+	struct hub_and_pnode *target_hub_and_pnode;
 };
 
 static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
 {
 	return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
 {
-	__set_bit(uvhub, &dstp->bits[0]);
+	__set_bit(pnode, &dstp->bits[0]);
 }
 static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
 				    int nbits)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 7cb6424317f6..c58e0ea39ef5 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	int tcpu;
-	int uvhub;
 	int locals = 0;
 	int remotes = 0;
 	int hubs = 0;
+	int tcpu;
+	int tpnode;
 	struct bau_desc *bau_desc;
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
 	struct bau_control *tbcp;
+	struct hub_and_pnode *hpp;
 
 	/* kernel was booted 'nobau' */
 	if (nobau)
@@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 
-	/* cpu statistics */
 	for_each_cpu(tcpu, flush_mask) {
-		uvhub = uv_cpu_to_blade_id(tcpu);
-		bau_uvhub_set(uvhub, &bau_desc->distribution);
-		if (uvhub == bcp->uvhub)
+		/*
+		 * The distribution vector is a bit map of pnodes, relative
+		 * to the partition base pnode (and the partition base nasid
+		 * in the header).
+		 * Translate cpu to pnode and hub using an array stored
+		 * in local memory.
+		 */
+		hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
+		tpnode = hpp->pnode - bcp->partition_base_pnode;
+		bau_uvhub_set(tpnode, &bau_desc->distribution);
+		if (hpp->uvhub == bcp->uvhub)
 			locals++;
 		else
 			remotes++;
@@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
-static void uv_enable_timeouts(void)
+static void __init uv_enable_timeouts(void)
 {
 	int uvhub;
 	int nuvhubs;
@@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void)
 }
 
 /*
- * initialize the sending side's sending buffers
+ * Initialize the sending side's sending buffers.
  */
 static void
-uv_activation_descriptor_init(int node, int pnode)
+uv_activation_descriptor_init(int node, int pnode, int base_pnode)
 {
 	int i;
 	int cpu;
@@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode)
 	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
+	/* the 14-bit pnode */
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
 			      (n << UV_DESC_BASE_PNODE_SHIFT | m));
-
 	/*
-	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+	 * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
 	 * cpu even though we only use the first one; one descriptor can
 	 * describe a broadcast to 256 uv hubs.
 	 */
@@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode)
 		memset(bd2, 0, sizeof(struct bau_desc));
 		bd2->header.sw_ack_flag = 1;
 		/*
-		 * base_dest_nodeid is the nasid of the first uvhub
-		 * in the partition. The bit map will indicate uvhub numbers,
-		 * which are 0-N in a partition. Pnodes are unique system-wide.
+		 * The base_dest_nasid set in the message header is the nasid
+		 * of the first uvhub in the partition. The bit map will
+		 * indicate destination pnode numbers relative to that base.
+		 * They may not be consecutive if nasid striding is being used.
 		 */
-		bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
-		bd2->header.dest_subnodeid = 0x10; /* the LB */
+		bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
+		bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
 		bd2->header.command = UV_NET_ENDPOINT_INTD;
 		bd2->header.int_both = 1;
 		/*
@@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode)
 /*
  * Initialization of each UV hub's structures
  */
-static void __init uv_init_uvhub(int uvhub, int vector)
+static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
 {
 	int node;
 	int pnode;
@@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector)
 
 	node = uvhub_to_first_node(uvhub);
 	pnode = uv_blade_to_pnode(uvhub);
-	uv_activation_descriptor_init(node, pnode);
+	uv_activation_descriptor_init(node, pnode, base_pnode);
 	uv_payload_queue_init(node, pnode);
 	/*
-	 * the below initialization can't be in firmware because the
-	 * messaging IRQ will be determined by the OS
+	 * The below initialization can't be in firmware because the
+	 * messaging IRQ will be determined by the OS.
 	 */
 	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
 	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -1491,10 +1500,11 @@ calculate_destination_timeout(void)
 /*
  * initialize the bau_control structure for each cpu
  */
-static int __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
 {
 	int i;
 	int cpu;
+	int tcpu;
 	int pnode;
 	int uvhub;
 	int have_hmaster;
@@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs)
 		bcp = &per_cpu(bau_control, cpu);
 		memset(bcp, 0, sizeof(struct bau_control));
 		pnode = uv_cpu_hub_info(cpu)->pnode;
+		if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
+			printk(KERN_EMERG
+				"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
+				cpu, pnode, base_part_pnode,
+				UV_DISTRIBUTION_SIZE);
+			return 1;
+		}
+		bcp->osnode = cpu_to_node(cpu);
+		bcp->partition_base_pnode = uv_partition_base_pnode;
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
 		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
 		bdp = &uvhub_descs[uvhub];
@@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs)
 		bdp->pnode = pnode;
 		/* kludge: 'assuming' one node per socket, and assuming that
 		   disabling a socket just leaves a gap in node numbers */
-		socket = (cpu_to_node(cpu) & 1);
+		socket = bcp->osnode & 1;
 		bdp->socket_mask |= (1 << socket);
 		sdp = &bdp->socket[socket];
 		sdp->cpu_number[sdp->num_cpus] = cpu;
@@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs)
 nextsocket:
 			socket++;
 			socket_mask = (socket_mask >> 1);
+			/* each socket gets a local array of pnodes/hubs */
+			bcp = smaster;
+			bcp->target_hub_and_pnode = kmalloc_node(
+				sizeof(struct hub_and_pnode) *
+				num_possible_cpus(), GFP_KERNEL, bcp->osnode);
+			memset(bcp->target_hub_and_pnode, 0,
+				sizeof(struct hub_and_pnode) *
+				num_possible_cpus());
+			for_each_present_cpu(tcpu) {
+				bcp->target_hub_and_pnode[tcpu].pnode =
+					uv_cpu_hub_info(tcpu)->pnode;
+				bcp->target_hub_and_pnode[tcpu].uvhub =
+					uv_cpu_hub_info(tcpu)->numa_blade_id;
+			}
 		}
 	}
 	kfree(uvhub_descs);
@@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void)
 	spin_lock_init(&disable_lock);
 	congested_cycles = microsec_2_cycles(congested_response_us);
 
-	if (uv_init_per_cpu(nuvhubs)) {
-		nobau = 1;
-		return 0;
-	}
-
 	uv_partition_base_pnode = 0x7fffffff;
-	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
 		if (uv_blade_nr_possible_cpus(uvhub) &&
 			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
 			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+	}
+
+	if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
+		nobau = 1;
+		return 0;
+	}
 
 	vector = UV_BAU_MESSAGE;
 	for_each_possible_blade(uvhub)
 		if (uv_blade_nr_possible_cpus(uvhub))
-			uv_init_uvhub(uvhub, vector);
+			uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);
 
 	uv_enable_timeouts();
 	alloc_intr_gate(vector, uv_bau_message_intr1);
-- 
cgit 


From d9a5ac9ef306eb5cc874f285185a15c303c50009 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Fri, 13 May 2011 15:52:09 +0200
Subject: x86, mce, AMD: Fix leaving freed data in a list

b may be added to a list, but is not removed before being freed
in the case of an error.  This is done in the corresponding
deallocation function, so the code here has been changed to
follow that.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression E,E1,E2;
identifier l;
@@

*list_add(&E->l,E1);
... when != E1
    when != list_del(&E->l)
    when != list_del_init(&E->l)
    when != E = E2
*kfree(E);// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/r/1305294731-12127-1-git-send-email-julia@diku.dk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 167f97b5596e..bb0adad35143 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -509,6 +509,7 @@ recurse:
 out_free:
 	if (b) {
 		kobject_put(&b->kobj);
+		list_del(&b->miscj);
 		kfree(b);
 	}
 	return err;
-- 
cgit 


From f550806a7fbca06b487238442546aceb7ecbb0c9 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 15 Feb 2011 22:34:49 -0800
Subject: alpha: convert to clocksource_register_hz

Converts alpha to use clocksource_register_hz.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
CC: Richard Henderson <rth@twiddle.net>
CC: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Matt Turner <mattst88@gmail.com>
---
 arch/alpha/kernel/time.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 918e8e0b72ff..818e74ed45dc 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -375,8 +375,7 @@ static struct clocksource clocksource_rpcc = {
 
 static inline void register_rpcc_clocksource(long cycle_freq)
 {
-	clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4);
-	clocksource_register(&clocksource_rpcc);
+	clocksource_register_hz(&clocksource_rpcc, cycle_freq);
 }
 #else /* !CONFIG_SMP */
 static inline void register_rpcc_clocksource(long cycle_freq)
-- 
cgit 


From 90b57f35164aa715dcc7d939a88780a23231f84e Mon Sep 17 00:00:00 2001
From: Michael Cree <mcree@orcon.net.nz>
Date: Wed, 4 May 2011 08:14:50 +0000
Subject: alpha: Wire up syscalls new to 2.6.39

Wire up the syscalls:
   name_to_handle_at
   open_by_handle_at
   clock_adjtime
   syncfs
and adjust some whitespace in the neighbourhood to align comments.

Signed-off-by: Michael Cree <mcree@orcon.net.nz>
Signed-off-by: Matt Turner <mattst88@gmail.com>
---
 arch/alpha/include/asm/unistd.h |  6 +++++-
 arch/alpha/kernel/systbls.S     | 12 ++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h
index 058937bf5a77..b1834166922d 100644
--- a/arch/alpha/include/asm/unistd.h
+++ b/arch/alpha/include/asm/unistd.h
@@ -452,10 +452,14 @@
 #define __NR_fanotify_init		494
 #define __NR_fanotify_mark		495
 #define __NR_prlimit64			496
+#define __NR_name_to_handle_at		497
+#define __NR_open_by_handle_at		498
+#define __NR_clock_adjtime		499
+#define __NR_syncfs			500
 
 #ifdef __KERNEL__
 
-#define NR_SYSCALLS			497
+#define NR_SYSCALLS			501
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index a6a1de9db16f..15f999d41c75 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -498,23 +498,27 @@ sys_call_table:
 	.quad sys_ni_syscall			/* sys_timerfd */
 	.quad sys_eventfd
 	.quad sys_recvmmsg
-	.quad sys_fallocate				/* 480 */
+	.quad sys_fallocate			/* 480 */
 	.quad sys_timerfd_create
 	.quad sys_timerfd_settime
 	.quad sys_timerfd_gettime
 	.quad sys_signalfd4
-	.quad sys_eventfd2				/* 485 */
+	.quad sys_eventfd2			/* 485 */
 	.quad sys_epoll_create1
 	.quad sys_dup3
 	.quad sys_pipe2
 	.quad sys_inotify_init1
-	.quad sys_preadv				/* 490 */
+	.quad sys_preadv			/* 490 */
 	.quad sys_pwritev
 	.quad sys_rt_tgsigqueueinfo
 	.quad sys_perf_event_open
 	.quad sys_fanotify_init
-	.quad sys_fanotify_mark				/* 495 */
+	.quad sys_fanotify_mark			/* 495 */
 	.quad sys_prlimit64
+	.quad sys_name_to_handle_at
+	.quad sys_open_by_handle_at
+	.quad sys_clock_adjtime
+	.quad sys_syncfs			/* 500 */
 
 	.size sys_call_table, . - sys_call_table
 	.type sys_call_table, @object
-- 
cgit 


From e503f9e4b092e2349a9477a333543de8f3c7f5d9 Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Fri, 22 Apr 2011 00:22:43 +0800
Subject: x86, apic: Fix spurious error interrupts triggering on all non-boot
 APs

This patch fixes a bug reported by a customer, who found
that many unreasonable error interrupts reported on all
non-boot CPUs (APs) during the system boot stage.

According to Chapter 10 of Intel Software Developer Manual
Volume 3A, Local APIC may signal an illegal vector error when
an LVT entry is set as an illegal vector value (0~15) under
FIXED delivery mode (bits 8-11 is 0), regardless of whether
the mask bit is set or an interrupt actually happens. These
errors are seen as error interrupts.

The initial value of thermal LVT entries on all APs always reads
0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
sequence to them and LVT registers are reset to 0s except for
the mask bits which are set to 1s when APs receive INIT IPI.

When the BIOS takes over the thermal throttling interrupt,
the LVT thermal delivery mode should be SMI and it is required
from the kernel to keep AP's LVT thermal monitoring register
programmed as such as well.

This issue happens when the BIOS does not take over the thermal throttling
interrupt: the AP's LVT thermal monitor register will be restored to
0x10000, which means vector 0 and fixed delivery mode, so all APs will
signal illegal vector error interrupts.

This patch checks that the interrupt delivery mode is not fixed mode before
restoring AP's LVT thermal monitor register.

Signed-off-by: Youquan Song <youquan.song@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Yong Wang <yong.y.wang@intel.com>
Cc: hpa@linux.intel.com
Cc: joe@perches.com
Cc: jbaron@redhat.com
Cc: trenn@suse.de
Cc: kent.liu@intel.com
Cc: chaohong.guo@intel.com
Cc: <stable@kernel.org> # As far back as possible
Link: http://lkml.kernel.org/r/1303402963-17738-1-git-send-email-youquan.song@intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apicdef.h           |  1 +
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index d87988bacf3e..34595d5e1038 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -78,6 +78,7 @@
 #define		APIC_DEST_LOGICAL	0x00800
 #define		APIC_DEST_PHYSICAL	0x00000
 #define		APIC_DM_FIXED		0x00000
+#define		APIC_DM_FIXED_MASK	0x00700
 #define		APIC_DM_LOWEST		0x00100
 #define		APIC_DM_SMI		0x00200
 #define		APIC_DM_REMRD		0x00300
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9da97f..0f034460260d 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -446,18 +446,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	 */
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 
+	h = lvtthmr_init;
 	/*
 	 * The initial value of thermal LVT entries on all APs always reads
 	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
 	 * sequence to them and LVT registers are reset to 0s except for
 	 * the mask bits which are set to 1s when APs receive INIT IPI.
-	 * Always restore the value that BIOS has programmed on AP based on
-	 * BSP's info we saved since BIOS is always setting the same value
-	 * for all threads/cores
+	 * If BIOS takes over the thermal interrupt and sets its interrupt
+	 * delivery mode to SMI (not fixed), it restores the value that the
+	 * BIOS has programmed on AP based on BSP's info we saved since BIOS
+	 * is always setting the same value for all threads/cores.
 	 */
-	apic_write(APIC_LVTTHMR, lvtthmr_init);
+	if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+		apic_write(APIC_LVTTHMR, lvtthmr_init);
 
-	h = lvtthmr_init;
 
 	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG
-- 
cgit 


From 328935e6348c6a7cb34798a68c326f4b8372e68a Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 17 May 2011 14:55:18 +0200
Subject: Revert "x86, AMD: Fix APIC timer erratum 400 affecting K8 Rev.A-E
 processors"

This reverts commit e20a2d205c05cef6b5783df339a7d54adeb50962, as it crashes
certain boxes with specific AMD CPU models.

Moving the lower endpoint of the Erratum 400 check to accommodate
earlier K8 revisions (A-E) opens a can of worms which is simply
not worth fixing properly by tweaking the errata checking
framework:

* missing IntPending MSR on revisions < CG causes #GP:

http://marc.info/?l=linux-kernel&m=130541471818831

* makes earlier revisions use the LAPIC timer instead of the C1E
idle routine which switches to HPET, thus not waking up in
deeper C-states:

http://lkml.org/lkml/2011/4/24/20

Therefore, leave the original boundary starting with K8-revF.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bb9eb29a52dd..3532d3bf8105 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -698,7 +698,7 @@ cpu_dev_register(amd_cpu_dev);
  */
 
 const int amd_erratum_400[] =
-	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf),
+	AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
 			    AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
 EXPORT_SYMBOL_GPL(amd_erratum_400);
 
-- 
cgit 


From 14fb57dccb6e1defe9f89a66f548fcb24c374c1d Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 17 May 2011 14:55:19 +0200
Subject: x86, AMD: Fix ARAT feature setting again

Trying to enable the local APIC timer on early K8 revisions
uncovers a number of other issues with it, in conjunction with
the C1E enter path on AMD. Fixing those causes much more churn
and troubles than the benefit of using that timer brings so
don't enable it on K8 at all, falling back to the original
functionality the kernel had wrt to that.

Reported-and-bisected-by: Nick Bowler <nbowler@elliptictech.com>
Cc: Boris Ostrovsky <Boris.Ostrovsky@amd.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Greg Kroah-Hartman <greg@kroah.com>
Cc: Hans Rosenfeld <hans.rosenfeld@amd.com>
Cc: Nick Bowler <nbowler@elliptictech.com>
Cc: Joerg-Volker-Peetz <jvpeetz@web.de>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Link: http://lkml.kernel.org/r/1305636919-31165-3-git-send-email-bp@amd64.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3532d3bf8105..6f9d1f6063e9 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -613,7 +613,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 #endif
 
 	/* As a rule processors have APIC timer running in deep C states */
-	if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+	if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400))
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
 	/*
-- 
cgit 


From b2db21997f43907f54500edaf063253ca2a186f9 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Tue, 17 May 2011 15:44:11 -0700
Subject: um: fix abort

os_dump_core() uses abort() to terminate UML in case of a fatal error.

glibc's abort() calls raise(SIGABRT) which makes use of tgkill().
tgkill() has no effect within UML's kernel threads because they are not
pthreads.  As fallback abort() executes an invalid instruction to
terminate the process.  Therefore UML gets killed by SIGSEGV and leaves an
ugly log entry in the host's kernel ring buffer.

To get rid of this we use our own abort routine.

Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/os-Linux/util.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c
index 6ea77979531c..42827cafa6af 100644
--- a/arch/um/os-Linux/util.c
+++ b/arch/um/os-Linux/util.c
@@ -5,6 +5,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <errno.h>
 #include <signal.h>
 #include <string.h>
@@ -75,6 +76,26 @@ void setup_hostinfo(char *buf, int len)
 		 host.release, host.version, host.machine);
 }
 
+/*
+ * We cannot use glibc's abort(). It makes use of tgkill() which
+ * has no effect within UML's kernel threads.
+ * After that glibc would execute an invalid instruction to kill
+ * the calling process and UML crashes with SIGSEGV.
+ */
+static inline void __attribute__ ((noreturn)) uml_abort(void)
+{
+	sigset_t sig;
+
+	fflush(NULL);
+
+	if (!sigemptyset(&sig) && !sigaddset(&sig, SIGABRT))
+		sigprocmask(SIG_UNBLOCK, &sig, 0);
+
+	for (;;)
+		if (kill(getpid(), SIGABRT) < 0)
+			exit(127);
+}
+
 void os_dump_core(void)
 {
 	int pid;
@@ -116,5 +137,5 @@ void os_dump_core(void)
 	while ((pid = waitpid(-1, NULL, WNOHANG | __WALL)) > 0)
 		os_kill_ptraced_process(pid, 0);
 
-	abort();
+	uml_abort();
 }
-- 
cgit 


From 3436830af53c38b7674097c00b02b7a4064476f2 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Thu, 12 May 2011 13:55:48 +0100
Subject: MIPS: RB532: Fix iomap resource size miscalculation.

This is the MIPS portion of Joe Perches <joe@perches.com>'s
https://patchwork.linux-mips.org/patch/2172/ which seems to have been
lost in time and space.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/rb532/gpio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c
index 37de05d595e7..6c47dfeb7be3 100644
--- a/arch/mips/rb532/gpio.c
+++ b/arch/mips/rb532/gpio.c
@@ -185,7 +185,7 @@ int __init rb532_gpio_init(void)
 	struct resource *r;
 
 	r = rb532_gpio_reg0_res;
-	rb532_gpio_chip->regbase = ioremap_nocache(r->start, r->end - r->start);
+	rb532_gpio_chip->regbase = ioremap_nocache(r->start, resource_size(r));
 
 	if (!rb532_gpio_chip->regbase) {
 		printk(KERN_ERR "rb532: cannot remap GPIO register 0\n");
-- 
cgit 


From 10423c91ffc8e59d4f99d401f7beb3115cdc117a Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Fri, 13 May 2011 10:33:28 +0100
Subject: MIPS: Fix duplicate invocation of notify_die.

Initial patch by Yury Polyanskiy <ypolyans@princeton.edu>.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Patchwork: https://patchwork.linux-mips.org/patch/2373/
---
 arch/mips/kernel/traps.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 71350f7f2d88..e9b3af27d844 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -374,7 +374,8 @@ void __noreturn die(const char *str, struct pt_regs *regs)
 	unsigned long dvpret = dvpe();
 #endif /* CONFIG_MIPS_MT_SMTC */
 
-	notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV);
+	if (notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV) == NOTIFY_STOP)
+		sig = 0;
 
 	console_verbose();
 	spin_lock_irq(&die_lock);
@@ -383,9 +384,6 @@ void __noreturn die(const char *str, struct pt_regs *regs)
 	mips_mt_regdump(dvpret);
 #endif /* CONFIG_MIPS_MT_SMTC */
 
-	if (notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV) == NOTIFY_STOP)
-		sig = 0;
-
 	printk("%s[#%d]:\n", str, ++die_counter);
 	show_registers(regs);
 	add_taint(TAINT_DIE);
-- 
cgit 


From 3e9957b4866f3767f19bf0e543b322ad7906c564 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian@openwrt.org>
Date: Fri, 13 May 2011 17:41:21 +0200
Subject: MIPS: AR7: Fix GPIO register size for Titan variant.

The 'size' variable contains the correct register size for both AR7
and Titan, but we never used it to ioremap the correct register size.
This problem only shows up on Titan.

[ralf@linux-mips.org: Fixed the fix.  The original patch as in patchwork
recognizes the problem correctly then fails to fix it ...]

Reported-by: Alexander Clouter <alex@digriz.org.uk>
Signed-off-by: Florian Fainelli <florian@openwrt.org>
Patchwork: https://patchwork.linux-mips.org/patch/2380/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/ar7/gpio.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/ar7/gpio.c b/arch/mips/ar7/gpio.c
index 425dfa5d6e12..bb571bcdb8f2 100644
--- a/arch/mips/ar7/gpio.c
+++ b/arch/mips/ar7/gpio.c
@@ -325,9 +325,7 @@ int __init ar7_gpio_init(void)
 		size = 0x1f;
 	}
 
-	gpch->regs = ioremap_nocache(AR7_REGS_GPIO,
-					AR7_REGS_GPIO + 0x10);
-
+	gpch->regs = ioremap_nocache(AR7_REGS_GPIO, size);
 	if (!gpch->regs) {
 		printk(KERN_ERR "%s: failed to ioremap regs\n",
 					gpch->chip.label);
-- 
cgit 


From a5602a3273774c720aaf165ff670e5b85e5910a5 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 18 May 2011 13:14:36 +0100
Subject: MIPS: Kludge IP27 build for 2.6.39.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/include/asm/dma-mapping.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 655f849bd08d..7aa37ddfca4b 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -5,7 +5,9 @@
 #include <asm/cache.h>
 #include <asm-generic/dma-coherent.h>
 
+#ifndef CONFIG_SGI_IP27	/* Kludge to fix 2.6.39 build for IP27 */
 #include <dma-coherence.h>
+#endif
 
 extern struct dma_map_ops *mips_dma_map_ops;
 
-- 
cgit 


From b1608d69cb804e414d0887140ba08a9398e4e638 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Wed, 18 May 2011 11:19:24 -0600
Subject: drivercore: revert addition of of_match to struct device

Commit b826291c, "drivercore/dt: add a match table pointer to struct
device" added an of_match pointer to struct device to cache the
of_match_table entry discovered at driver match time.  This was unsafe
because matching is not an atomic operation with probing a driver.  If
two or more drivers are attempted to be matched to a device at the
same time, then the cached matching entry pointer could get
overwritten.

This patch reverts the of_match cache pointer and reworks all users to
call of_match_device() directly instead.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/platforms/83xx/suspend.c | 7 +++++--
 arch/powerpc/sysdev/fsl_msi.c         | 7 +++++--
 arch/sparc/kernel/pci_sabre.c         | 5 ++++-
 arch/sparc/kernel/pci_schizo.c        | 8 ++++++--
 4 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c
index 188272934cfb..104faa8aa23c 100644
--- a/arch/powerpc/platforms/83xx/suspend.c
+++ b/arch/powerpc/platforms/83xx/suspend.c
@@ -318,17 +318,20 @@ static const struct platform_suspend_ops mpc83xx_suspend_ops = {
 	.end = mpc83xx_suspend_end,
 };
 
+static struct of_device_id pmc_match[];
 static int pmc_probe(struct platform_device *ofdev)
 {
+	const struct of_device_id *match;
 	struct device_node *np = ofdev->dev.of_node;
 	struct resource res;
 	struct pmc_type *type;
 	int ret = 0;
 
-	if (!ofdev->dev.of_match)
+	match = of_match_device(pmc_match, &ofdev->dev);
+	if (!match)
 		return -EINVAL;
 
-	type = ofdev->dev.of_match->data;
+	type = match->data;
 
 	if (!of_device_is_available(np))
 		return -ENODEV;
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index d5679dc1e20f..01cd2f089512 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -304,8 +304,10 @@ static int __devinit fsl_msi_setup_hwirq(struct fsl_msi *msi,
 	return 0;
 }
 
+static const struct of_device_id fsl_of_msi_ids[];
 static int __devinit fsl_of_msi_probe(struct platform_device *dev)
 {
+	const struct of_device_id *match;
 	struct fsl_msi *msi;
 	struct resource res;
 	int err, i, j, irq_index, count;
@@ -316,9 +318,10 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev)
 	u32 offset;
 	static const u32 all_avail[] = { 0, NR_MSI_IRQS };
 
-	if (!dev->dev.of_match)
+	match = of_match_device(fsl_of_msi_ids, &dev->dev);
+	if (!match)
 		return -EINVAL;
-	features = dev->dev.of_match->data;
+	features = match->data;
 
 	printk(KERN_DEBUG "Setting up Freescale MSI support\n");
 
diff --git a/arch/sparc/kernel/pci_sabre.c b/arch/sparc/kernel/pci_sabre.c
index 948068a083fc..d1840dbdaa2f 100644
--- a/arch/sparc/kernel/pci_sabre.c
+++ b/arch/sparc/kernel/pci_sabre.c
@@ -452,8 +452,10 @@ static void __devinit sabre_pbm_init(struct pci_pbm_info *pbm,
 	sabre_scan_bus(pbm, &op->dev);
 }
 
+static const struct of_device_id sabre_match[];
 static int __devinit sabre_probe(struct platform_device *op)
 {
+	const struct of_device_id *match;
 	const struct linux_prom64_registers *pr_regs;
 	struct device_node *dp = op->dev.of_node;
 	struct pci_pbm_info *pbm;
@@ -463,7 +465,8 @@ static int __devinit sabre_probe(struct platform_device *op)
 	const u32 *vdma;
 	u64 clear_irq;
 
-	hummingbird_p = op->dev.of_match && (op->dev.of_match->data != NULL);
+	match = of_match_device(sabre_match, &op->dev);
+	hummingbird_p = match && (match->data != NULL);
 	if (!hummingbird_p) {
 		struct device_node *cpu_dp;
 
diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c
index fecfcb2063c8..283fbc329a43 100644
--- a/arch/sparc/kernel/pci_schizo.c
+++ b/arch/sparc/kernel/pci_schizo.c
@@ -1458,11 +1458,15 @@ out_err:
 	return err;
 }
 
+static const struct of_device_id schizo_match[];
 static int __devinit schizo_probe(struct platform_device *op)
 {
-	if (!op->dev.of_match)
+	const struct of_device_id *match;
+
+	match = of_match_device(schizo_match, &op->dev);
+	if (!match)
 		return -EINVAL;
-	return __schizo_init(op, (unsigned long) op->dev.of_match->data);
+	return __schizo_init(op, (unsigned long)match->data);
 }
 
 /* The ordering of this table is very important.  Some Tomatillo
-- 
cgit 


From 90d231f7673e20acc4f8b5c3effb5c12098179a7 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Fri, 29 Apr 2011 11:26:22 +0200
Subject: OMAP3 cpuidle: remove useless SDP specific timings

The cpuidle state settings can be overridden by board-specific
settings, by calling omap3_pm_init_cpuidle.
Remove the registration of the 3430SDP specific state settings
since the figures are identical to the default ones (in cpuidle34xx.c).

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/board-3430sdp.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mach-omap2/board-3430sdp.c b/arch/arm/mach-omap2/board-3430sdp.c
index 9afd087cc29c..7ffad7bfb820 100644
--- a/arch/arm/mach-omap2/board-3430sdp.c
+++ b/arch/arm/mach-omap2/board-3430sdp.c
@@ -59,24 +59,6 @@
 
 #define TWL4030_MSECURE_GPIO 22
 
-/* FIXME: These values need to be updated based on more profiling on 3430sdp*/
-static struct cpuidle_params omap3_cpuidle_params_table[] = {
-	/* C1 */
-	{1, 2, 2, 5},
-	/* C2 */
-	{1, 10, 10, 30},
-	/* C3 */
-	{1, 50, 50, 300},
-	/* C4 */
-	{1, 1500, 1800, 4000},
-	/* C5 */
-	{1, 2500, 7500, 12000},
-	/* C6 */
-	{1, 3000, 8500, 15000},
-	/* C7 */
-	{1, 10000, 30000, 300000},
-};
-
 static uint32_t board_keymap[] = {
 	KEY(0, 0, KEY_LEFT),
 	KEY(0, 1, KEY_RIGHT),
@@ -883,7 +865,6 @@ static void __init omap_3430sdp_init(void)
 	omap3_mux_init(board_mux, OMAP_PACKAGE_CBB);
 	omap_board_config = sdp3430_config;
 	omap_board_config_size = ARRAY_SIZE(sdp3430_config);
-	omap3_pm_init_cpuidle(omap3_cpuidle_params_table);
 	omap3430_i2c_init();
 	omap_display_init(&sdp3430_dss_data);
 	if (omap_rev() > OMAP3430_REV_ES1_0)
-- 
cgit 


From 866ba0ef967c693dae952afafcb1582a390a82a0 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Mon, 9 May 2011 12:02:13 +0200
Subject: OMAP3: clean-up mach specific cpuidle data structures

- sleep_latency and wake_latency are not used; replace them with
  exit_latency, which is used by cpuidle. exit_latency is simply
  the sum of sleep_latency and wake_latency,
- replace threshold with target_residency,
- change the OMAP3 specific cpuidle code accordingly,
- change the OMAP3 board code accordingly.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/board-rx51.c  |  18 ++++---
 arch/arm/mach-omap2/cpuidle34xx.c | 103 ++++++++++++++++----------------------
 arch/arm/mach-omap2/pm.h          |  13 +++--
 3 files changed, 63 insertions(+), 71 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mach-omap2/board-rx51.c b/arch/arm/mach-omap2/board-rx51.c
index f8ba20a14e62..fec4cac8fa0a 100644
--- a/arch/arm/mach-omap2/board-rx51.c
+++ b/arch/arm/mach-omap2/board-rx51.c
@@ -58,21 +58,25 @@ static struct platform_device leds_gpio = {
 	},
 };
 
+/*
+ * cpuidle C-states definition override from the default values.
+ * The 'exit_latency' field is the sum of sleep and wake-up latencies.
+ */
 static struct cpuidle_params rx51_cpuidle_params[] = {
 	/* C1 */
-	{1, 110, 162, 5},
+	{110 + 162, 5 , 1},
 	/* C2 */
-	{1, 106, 180, 309},
+	{106 + 180, 309, 1},
 	/* C3 */
-	{0, 107, 410, 46057},
+	{107 + 410, 46057, 0},
 	/* C4 */
-	{0, 121, 3374, 46057},
+	{121 + 3374, 46057, 0},
 	/* C5 */
-	{1, 855, 1146, 46057},
+	{855 + 1146, 46057, 1},
 	/* C6 */
-	{0, 7580, 4134, 484329},
+	{7580 + 4134, 484329, 0},
 	/* C7 */
-	{1, 7505, 15274, 484329},
+	{7505 + 15274, 484329, 1},
 };
 
 static struct omap_lcd_config rx51_lcd_config = {
diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index 1c240eff3918..d7bc31a2b3af 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -52,11 +52,10 @@
 struct omap3_processor_cx {
 	u8 valid;
 	u8 type;
-	u32 sleep_latency;
-	u32 wakeup_latency;
+	u32 exit_latency;
 	u32 mpu_state;
 	u32 core_state;
-	u32 threshold;
+	u32 target_residency;
 	u32 flags;
 	const char *desc;
 };
@@ -75,19 +74,19 @@ struct powerdomain *cam_pd;
  */
 static struct cpuidle_params cpuidle_params_table[] = {
 	/* C1 */
-	{1, 2, 2, 5},
+	{2 + 2, 5, 1},
 	/* C2 */
-	{1, 10, 10, 30},
+	{10 + 10, 30, 1},
 	/* C3 */
-	{1, 50, 50, 300},
+	{50 + 50, 300, 1},
 	/* C4 */
-	{1, 1500, 1800, 4000},
+	{1500 + 1800, 4000, 1},
 	/* C5 */
-	{1, 2500, 7500, 12000},
+	{2500 + 7500, 12000, 1},
 	/* C6 */
-	{1, 3000, 8500, 15000},
+	{3000 + 8500, 15000, 1},
 	/* C7 */
-	{1, 10000, 30000, 300000},
+	{10000 + 30000, 300000, 1},
 };
 
 static int omap3_idle_bm_check(void)
@@ -330,12 +329,10 @@ void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params)
 	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
 		cpuidle_params_table[i].valid =
 			cpuidle_board_params[i].valid;
-		cpuidle_params_table[i].sleep_latency =
-			cpuidle_board_params[i].sleep_latency;
-		cpuidle_params_table[i].wake_latency =
-			cpuidle_board_params[i].wake_latency;
-		cpuidle_params_table[i].threshold =
-			cpuidle_board_params[i].threshold;
+		cpuidle_params_table[i].exit_latency =
+			cpuidle_board_params[i].exit_latency;
+		cpuidle_params_table[i].target_residency =
+			cpuidle_board_params[i].target_residency;
 	}
 	return;
 }
@@ -357,12 +354,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C1].valid =
 			cpuidle_params_table[OMAP3_STATE_C1].valid;
 	omap3_power_states[OMAP3_STATE_C1].type = OMAP3_STATE_C1;
-	omap3_power_states[OMAP3_STATE_C1].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C1].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C1].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C1].wake_latency;
-	omap3_power_states[OMAP3_STATE_C1].threshold =
-			cpuidle_params_table[OMAP3_STATE_C1].threshold;
+	omap3_power_states[OMAP3_STATE_C1].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C1].exit_latency;
+	omap3_power_states[OMAP3_STATE_C1].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C1].target_residency;
 	omap3_power_states[OMAP3_STATE_C1].mpu_state = PWRDM_POWER_ON;
 	omap3_power_states[OMAP3_STATE_C1].core_state = PWRDM_POWER_ON;
 	omap3_power_states[OMAP3_STATE_C1].flags = CPUIDLE_FLAG_TIME_VALID;
@@ -372,12 +367,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C2].valid =
 			cpuidle_params_table[OMAP3_STATE_C2].valid;
 	omap3_power_states[OMAP3_STATE_C2].type = OMAP3_STATE_C2;
-	omap3_power_states[OMAP3_STATE_C2].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C2].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C2].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C2].wake_latency;
-	omap3_power_states[OMAP3_STATE_C2].threshold =
-			cpuidle_params_table[OMAP3_STATE_C2].threshold;
+	omap3_power_states[OMAP3_STATE_C2].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C2].exit_latency;
+	omap3_power_states[OMAP3_STATE_C2].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C2].target_residency;
 	omap3_power_states[OMAP3_STATE_C2].mpu_state = PWRDM_POWER_ON;
 	omap3_power_states[OMAP3_STATE_C2].core_state = PWRDM_POWER_ON;
 	omap3_power_states[OMAP3_STATE_C2].flags = CPUIDLE_FLAG_TIME_VALID |
@@ -388,12 +381,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C3].valid =
 			cpuidle_params_table[OMAP3_STATE_C3].valid;
 	omap3_power_states[OMAP3_STATE_C3].type = OMAP3_STATE_C3;
-	omap3_power_states[OMAP3_STATE_C3].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C3].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C3].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C3].wake_latency;
-	omap3_power_states[OMAP3_STATE_C3].threshold =
-			cpuidle_params_table[OMAP3_STATE_C3].threshold;
+	omap3_power_states[OMAP3_STATE_C3].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C3].exit_latency;
+	omap3_power_states[OMAP3_STATE_C3].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C3].target_residency;
 	omap3_power_states[OMAP3_STATE_C3].mpu_state = PWRDM_POWER_RET;
 	omap3_power_states[OMAP3_STATE_C3].core_state = PWRDM_POWER_ON;
 	omap3_power_states[OMAP3_STATE_C3].flags = CPUIDLE_FLAG_TIME_VALID |
@@ -404,12 +395,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C4].valid =
 			cpuidle_params_table[OMAP3_STATE_C4].valid;
 	omap3_power_states[OMAP3_STATE_C4].type = OMAP3_STATE_C4;
-	omap3_power_states[OMAP3_STATE_C4].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C4].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C4].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C4].wake_latency;
-	omap3_power_states[OMAP3_STATE_C4].threshold =
-			cpuidle_params_table[OMAP3_STATE_C4].threshold;
+	omap3_power_states[OMAP3_STATE_C4].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C4].exit_latency;
+	omap3_power_states[OMAP3_STATE_C4].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C4].target_residency;
 	omap3_power_states[OMAP3_STATE_C4].mpu_state = PWRDM_POWER_OFF;
 	omap3_power_states[OMAP3_STATE_C4].core_state = PWRDM_POWER_ON;
 	omap3_power_states[OMAP3_STATE_C4].flags = CPUIDLE_FLAG_TIME_VALID |
@@ -420,12 +409,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C5].valid =
 			cpuidle_params_table[OMAP3_STATE_C5].valid;
 	omap3_power_states[OMAP3_STATE_C5].type = OMAP3_STATE_C5;
-	omap3_power_states[OMAP3_STATE_C5].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C5].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C5].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C5].wake_latency;
-	omap3_power_states[OMAP3_STATE_C5].threshold =
-			cpuidle_params_table[OMAP3_STATE_C5].threshold;
+	omap3_power_states[OMAP3_STATE_C5].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C5].exit_latency;
+	omap3_power_states[OMAP3_STATE_C5].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C5].target_residency;
 	omap3_power_states[OMAP3_STATE_C5].mpu_state = PWRDM_POWER_RET;
 	omap3_power_states[OMAP3_STATE_C5].core_state = PWRDM_POWER_RET;
 	omap3_power_states[OMAP3_STATE_C5].flags = CPUIDLE_FLAG_TIME_VALID |
@@ -436,12 +423,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C6].valid =
 			cpuidle_params_table[OMAP3_STATE_C6].valid;
 	omap3_power_states[OMAP3_STATE_C6].type = OMAP3_STATE_C6;
-	omap3_power_states[OMAP3_STATE_C6].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C6].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C6].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C6].wake_latency;
-	omap3_power_states[OMAP3_STATE_C6].threshold =
-			cpuidle_params_table[OMAP3_STATE_C6].threshold;
+	omap3_power_states[OMAP3_STATE_C6].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C6].exit_latency;
+	omap3_power_states[OMAP3_STATE_C6].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C6].target_residency;
 	omap3_power_states[OMAP3_STATE_C6].mpu_state = PWRDM_POWER_OFF;
 	omap3_power_states[OMAP3_STATE_C6].core_state = PWRDM_POWER_RET;
 	omap3_power_states[OMAP3_STATE_C6].flags = CPUIDLE_FLAG_TIME_VALID |
@@ -452,12 +437,10 @@ void omap_init_power_states(void)
 	omap3_power_states[OMAP3_STATE_C7].valid =
 			cpuidle_params_table[OMAP3_STATE_C7].valid;
 	omap3_power_states[OMAP3_STATE_C7].type = OMAP3_STATE_C7;
-	omap3_power_states[OMAP3_STATE_C7].sleep_latency =
-			cpuidle_params_table[OMAP3_STATE_C7].sleep_latency;
-	omap3_power_states[OMAP3_STATE_C7].wakeup_latency =
-			cpuidle_params_table[OMAP3_STATE_C7].wake_latency;
-	omap3_power_states[OMAP3_STATE_C7].threshold =
-			cpuidle_params_table[OMAP3_STATE_C7].threshold;
+	omap3_power_states[OMAP3_STATE_C7].exit_latency =
+			cpuidle_params_table[OMAP3_STATE_C7].exit_latency;
+	omap3_power_states[OMAP3_STATE_C7].target_residency =
+			cpuidle_params_table[OMAP3_STATE_C7].target_residency;
 	omap3_power_states[OMAP3_STATE_C7].mpu_state = PWRDM_POWER_OFF;
 	omap3_power_states[OMAP3_STATE_C7].core_state = PWRDM_POWER_OFF;
 	omap3_power_states[OMAP3_STATE_C7].flags = CPUIDLE_FLAG_TIME_VALID |
@@ -512,8 +495,8 @@ int __init omap3_idle_init(void)
 		if (!cx->valid)
 			continue;
 		cpuidle_set_statedata(state, cx);
-		state->exit_latency = cx->sleep_latency + cx->wakeup_latency;
-		state->target_residency = cx->threshold;
+		state->exit_latency = cx->exit_latency;
+		state->target_residency = cx->target_residency;
 		state->flags = cx->flags;
 		state->enter = (state->flags & CPUIDLE_FLAG_CHECK_BM) ?
 			omap3_enter_idle_bm : omap3_enter_idle;
diff --git a/arch/arm/mach-omap2/pm.h b/arch/arm/mach-omap2/pm.h
index 797bfd12b643..32dbc1311efd 100644
--- a/arch/arm/mach-omap2/pm.h
+++ b/arch/arm/mach-omap2/pm.h
@@ -36,11 +36,16 @@ static inline int omap4_opp_init(void)
 }
 #endif
 
+/*
+ * cpuidle mach specific parameters
+ *
+ * The board code can override the default C-states definition using
+ * omap3_pm_init_cpuidle
+ */
 struct cpuidle_params {
-	u8  valid;
-	u32 sleep_latency;
-	u32 wake_latency;
-	u32 threshold;
+	u32 exit_latency;	/* exit_latency = sleep + wake-up latencies */
+	u32 target_residency;
+	u8 valid;		/* validates the C-state */
 };
 
 #if defined(CONFIG_PM) && defined(CONFIG_CPU_IDLE)
-- 
cgit 


From badc303a862ba6c5fd3d324f5332db07877f8159 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Mon, 9 May 2011 12:02:14 +0200
Subject: OMAP3: cpuidle: re-organize the C-states data

The current implementation defines an internal structure and a
C-states array. Those structures are redundant with the
structs used by the cpuidle framework.

This patch provides a clean-up of the internal struct, removes the
internal C-states array, stores the data using the existing cpuidle
per C-state struct and registers the mach specific data to cpuidle
C-state driver_data (accessed using cpuidle_[gs]et_statedata).
Also removes unused macros, fields and code and compacts the repeating
code using an inline helper function.

The result is more compact and more readable code as well as
reduced data RAM usage.

Also retain C1 as the only always valid C-state and system safe state.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/cpuidle34xx.c | 305 +++++++++++++-------------------------
 1 file changed, 101 insertions(+), 204 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index d7bc31a2b3af..f9c8676b1f4c 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -36,35 +36,6 @@
 
 #ifdef CONFIG_CPU_IDLE
 
-#define OMAP3_MAX_STATES 7
-#define OMAP3_STATE_C1 0 /* C1 - MPU WFI + Core active */
-#define OMAP3_STATE_C2 1 /* C2 - MPU WFI + Core inactive */
-#define OMAP3_STATE_C3 2 /* C3 - MPU CSWR + Core inactive */
-#define OMAP3_STATE_C4 3 /* C4 - MPU OFF + Core iactive */
-#define OMAP3_STATE_C5 4 /* C5 - MPU RET + Core RET */
-#define OMAP3_STATE_C6 5 /* C6 - MPU OFF + Core RET */
-#define OMAP3_STATE_C7 6 /* C7 - MPU OFF + Core OFF */
-
-#define OMAP3_STATE_MAX OMAP3_STATE_C7
-
-#define CPUIDLE_FLAG_CHECK_BM	0x10000	/* use omap3_enter_idle_bm() */
-
-struct omap3_processor_cx {
-	u8 valid;
-	u8 type;
-	u32 exit_latency;
-	u32 mpu_state;
-	u32 core_state;
-	u32 target_residency;
-	u32 flags;
-	const char *desc;
-};
-
-struct omap3_processor_cx omap3_power_states[OMAP3_MAX_STATES];
-struct omap3_processor_cx current_cx_state;
-struct powerdomain *mpu_pd, *core_pd, *per_pd;
-struct powerdomain *cam_pd;
-
 /*
  * The latencies/thresholds for various C states have
  * to be configured from the respective board files.
@@ -88,6 +59,17 @@ static struct cpuidle_params cpuidle_params_table[] = {
 	/* C7 */
 	{10000 + 30000, 300000, 1},
 };
+#define OMAP3_NUM_STATES ARRAY_SIZE(cpuidle_params_table)
+
+/* Mach specific information to be recorded in the C-state driver_data */
+struct omap3_idle_statedata {
+	u32 mpu_state;
+	u32 core_state;
+	u8 valid;
+};
+struct omap3_idle_statedata omap3_idle_data[OMAP3_NUM_STATES];
+
+struct powerdomain *mpu_pd, *core_pd, *per_pd, *cam_pd;
 
 static int omap3_idle_bm_check(void)
 {
@@ -121,12 +103,10 @@ static int _cpuidle_deny_idle(struct powerdomain *pwrdm,
 static int omap3_enter_idle(struct cpuidle_device *dev,
 			struct cpuidle_state *state)
 {
-	struct omap3_processor_cx *cx = cpuidle_get_statedata(state);
+	struct omap3_idle_statedata *cx = cpuidle_get_statedata(state);
 	struct timespec ts_preidle, ts_postidle, ts_idle;
 	u32 mpu_state = cx->mpu_state, core_state = cx->core_state;
 
-	current_cx_state = *cx;
-
 	/* Used to keep track of the total time in idle */
 	getnstimeofday(&ts_preidle);
 
@@ -139,7 +119,8 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 	if (omap_irq_pending() || need_resched())
 		goto return_sleep_time;
 
-	if (cx->type == OMAP3_STATE_C1) {
+	/* Deny idle for C1 */
+	if (state == &dev->states[0]) {
 		pwrdm_for_each_clkdm(mpu_pd, _cpuidle_deny_idle);
 		pwrdm_for_each_clkdm(core_pd, _cpuidle_deny_idle);
 	}
@@ -147,7 +128,8 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 	/* Execute ARM wfi */
 	omap_sram_idle();
 
-	if (cx->type == OMAP3_STATE_C1) {
+	/* Re-allow idle for C1 */
+	if (state == &dev->states[0]) {
 		pwrdm_for_each_clkdm(mpu_pd, _cpuidle_allow_idle);
 		pwrdm_for_each_clkdm(core_pd, _cpuidle_allow_idle);
 	}
@@ -169,26 +151,26 @@ return_sleep_time:
  *
  * If the current state is valid, it is returned back to the caller.
  * Else, this function searches for a lower c-state which is still
- * valid (as defined in omap3_power_states[]).
+ * valid.
  */
 static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
-						struct cpuidle_state *curr)
+					      struct cpuidle_state *curr)
 {
 	struct cpuidle_state *next = NULL;
-	struct omap3_processor_cx *cx;
+	struct omap3_idle_statedata *cx;
 
-	cx = (struct omap3_processor_cx *)cpuidle_get_statedata(curr);
+	cx = cpuidle_get_statedata(curr);
 
 	/* Check if current state is valid */
 	if (cx->valid) {
 		return curr;
 	} else {
-		u8 idx = OMAP3_STATE_MAX;
+		int idx = OMAP3_NUM_STATES - 1;
 
 		/*
 		 * Reach the current state starting at highest C-state
 		 */
-		for (; idx >= OMAP3_STATE_C1; idx--) {
+		for (; idx >= 0; idx--) {
 			if (&dev->states[idx] == curr) {
 				next = &dev->states[idx];
 				break;
@@ -205,9 +187,7 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 		 * Start search from the next (lower) state.
 		 */
 		idx--;
-		for (; idx >= OMAP3_STATE_C1; idx--) {
-			struct omap3_processor_cx *cx;
-
+		for (; idx >= 0; idx--) {
 			cx = cpuidle_get_statedata(&dev->states[idx]);
 			if (cx->valid) {
 				next = &dev->states[idx];
@@ -215,7 +195,7 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 			}
 		}
 		/*
-		 * C1 and C2 are always valid.
+		 * C1 is always valid.
 		 * So, no need to check for 'next==NULL' outside this loop.
 		 */
 	}
@@ -228,9 +208,8 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
  * @dev: cpuidle device
  * @state: The target state to be programmed
  *
- * Used for C states with CPUIDLE_FLAG_CHECK_BM flag set. This
- * function checks for any pending activity and then programs the
- * device to the specified or a safer state.
+ * This function checks for any pending activity and then programs
+ * the device to the specified or a safer state.
  */
 static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 			       struct cpuidle_state *state)
@@ -238,10 +217,10 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	struct cpuidle_state *new_state = next_valid_state(dev, state);
 	u32 core_next_state, per_next_state = 0, per_saved_state = 0;
 	u32 cam_state;
-	struct omap3_processor_cx *cx;
+	struct omap3_idle_statedata *cx;
 	int ret;
 
-	if ((state->flags & CPUIDLE_FLAG_CHECK_BM) && omap3_idle_bm_check()) {
+	if (omap3_idle_bm_check()) {
 		BUG_ON(!dev->safe_state);
 		new_state = dev->safe_state;
 		goto select_state;
@@ -307,8 +286,8 @@ void omap3_cpuidle_update_states(u32 mpu_deepest_state, u32 core_deepest_state)
 {
 	int i;
 
-	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
-		struct omap3_processor_cx *cx = &omap3_power_states[i];
+	for (i = 0; i < OMAP3_NUM_STATES; i++) {
+		struct omap3_idle_statedata *cx = &omap3_idle_data[i];
 
 		if ((cx->mpu_state >= mpu_deepest_state) &&
 		    (cx->core_state >= core_deepest_state)) {
@@ -326,9 +305,8 @@ void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params)
 	if (!cpuidle_board_params)
 		return;
 
-	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
-		cpuidle_params_table[i].valid =
-			cpuidle_board_params[i].valid;
+	for (i = 0; i < OMAP3_NUM_STATES; i++) {
+		cpuidle_params_table[i].valid =	cpuidle_board_params[i].valid;
 		cpuidle_params_table[i].exit_latency =
 			cpuidle_board_params[i].exit_latency;
 		cpuidle_params_table[i].target_residency =
@@ -337,185 +315,104 @@ void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params)
 	return;
 }
 
-/* omap3_init_power_states - Initialises the OMAP3 specific C states.
- *
- * Below is the desciption of each C state.
- * 	C1 . MPU WFI + Core active
- *	C2 . MPU WFI + Core inactive
- *	C3 . MPU CSWR + Core inactive
- *	C4 . MPU OFF + Core inactive
- *	C5 . MPU CSWR + Core CSWR
- *	C6 . MPU OFF + Core CSWR
- *	C7 . MPU OFF + Core OFF
- */
-void omap_init_power_states(void)
-{
-	/* C1 . MPU WFI + Core active */
-	omap3_power_states[OMAP3_STATE_C1].valid =
-			cpuidle_params_table[OMAP3_STATE_C1].valid;
-	omap3_power_states[OMAP3_STATE_C1].type = OMAP3_STATE_C1;
-	omap3_power_states[OMAP3_STATE_C1].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C1].exit_latency;
-	omap3_power_states[OMAP3_STATE_C1].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C1].target_residency;
-	omap3_power_states[OMAP3_STATE_C1].mpu_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C1].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C1].flags = CPUIDLE_FLAG_TIME_VALID;
-	omap3_power_states[OMAP3_STATE_C1].desc = "MPU ON + CORE ON";
-
-	/* C2 . MPU WFI + Core inactive */
-	omap3_power_states[OMAP3_STATE_C2].valid =
-			cpuidle_params_table[OMAP3_STATE_C2].valid;
-	omap3_power_states[OMAP3_STATE_C2].type = OMAP3_STATE_C2;
-	omap3_power_states[OMAP3_STATE_C2].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C2].exit_latency;
-	omap3_power_states[OMAP3_STATE_C2].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C2].target_residency;
-	omap3_power_states[OMAP3_STATE_C2].mpu_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C2].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C2].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C2].desc = "MPU ON + CORE ON";
-
-	/* C3 . MPU CSWR + Core inactive */
-	omap3_power_states[OMAP3_STATE_C3].valid =
-			cpuidle_params_table[OMAP3_STATE_C3].valid;
-	omap3_power_states[OMAP3_STATE_C3].type = OMAP3_STATE_C3;
-	omap3_power_states[OMAP3_STATE_C3].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C3].exit_latency;
-	omap3_power_states[OMAP3_STATE_C3].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C3].target_residency;
-	omap3_power_states[OMAP3_STATE_C3].mpu_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C3].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C3].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C3].desc = "MPU RET + CORE ON";
-
-	/* C4 . MPU OFF + Core inactive */
-	omap3_power_states[OMAP3_STATE_C4].valid =
-			cpuidle_params_table[OMAP3_STATE_C4].valid;
-	omap3_power_states[OMAP3_STATE_C4].type = OMAP3_STATE_C4;
-	omap3_power_states[OMAP3_STATE_C4].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C4].exit_latency;
-	omap3_power_states[OMAP3_STATE_C4].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C4].target_residency;
-	omap3_power_states[OMAP3_STATE_C4].mpu_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C4].core_state = PWRDM_POWER_ON;
-	omap3_power_states[OMAP3_STATE_C4].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C4].desc = "MPU OFF + CORE ON";
-
-	/* C5 . MPU CSWR + Core CSWR*/
-	omap3_power_states[OMAP3_STATE_C5].valid =
-			cpuidle_params_table[OMAP3_STATE_C5].valid;
-	omap3_power_states[OMAP3_STATE_C5].type = OMAP3_STATE_C5;
-	omap3_power_states[OMAP3_STATE_C5].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C5].exit_latency;
-	omap3_power_states[OMAP3_STATE_C5].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C5].target_residency;
-	omap3_power_states[OMAP3_STATE_C5].mpu_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C5].core_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C5].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C5].desc = "MPU RET + CORE RET";
-
-	/* C6 . MPU OFF + Core CSWR */
-	omap3_power_states[OMAP3_STATE_C6].valid =
-			cpuidle_params_table[OMAP3_STATE_C6].valid;
-	omap3_power_states[OMAP3_STATE_C6].type = OMAP3_STATE_C6;
-	omap3_power_states[OMAP3_STATE_C6].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C6].exit_latency;
-	omap3_power_states[OMAP3_STATE_C6].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C6].target_residency;
-	omap3_power_states[OMAP3_STATE_C6].mpu_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C6].core_state = PWRDM_POWER_RET;
-	omap3_power_states[OMAP3_STATE_C6].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C6].desc = "MPU OFF + CORE RET";
-
-	/* C7 . MPU OFF + Core OFF */
-	omap3_power_states[OMAP3_STATE_C7].valid =
-			cpuidle_params_table[OMAP3_STATE_C7].valid;
-	omap3_power_states[OMAP3_STATE_C7].type = OMAP3_STATE_C7;
-	omap3_power_states[OMAP3_STATE_C7].exit_latency =
-			cpuidle_params_table[OMAP3_STATE_C7].exit_latency;
-	omap3_power_states[OMAP3_STATE_C7].target_residency =
-			cpuidle_params_table[OMAP3_STATE_C7].target_residency;
-	omap3_power_states[OMAP3_STATE_C7].mpu_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C7].core_state = PWRDM_POWER_OFF;
-	omap3_power_states[OMAP3_STATE_C7].flags = CPUIDLE_FLAG_TIME_VALID |
-				CPUIDLE_FLAG_CHECK_BM;
-	omap3_power_states[OMAP3_STATE_C7].desc = "MPU OFF + CORE OFF";
-
-	/*
-	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
-	 * enable OFF mode in a stable form for previous revisions.
-	 * we disable C7 state as a result.
-	 */
-	if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) {
-		omap3_power_states[OMAP3_STATE_C7].valid = 0;
-		cpuidle_params_table[OMAP3_STATE_C7].valid = 0;
-		pr_warn("%s: core off state C7 disabled due to i583\n",
-				__func__);
-	}
-}
-
 struct cpuidle_driver omap3_idle_driver = {
 	.name = 	"omap3_idle",
 	.owner = 	THIS_MODULE,
 };
 
+/* Fill in the state data from the mach tables and register the driver_data */
+static inline struct omap3_idle_statedata *_fill_cstate(
+					struct cpuidle_device *dev,
+					int idx, const char *descr)
+{
+	struct omap3_idle_statedata *cx = &omap3_idle_data[idx];
+	struct cpuidle_state *state = &dev->states[idx];
+
+	state->exit_latency	= cpuidle_params_table[idx].exit_latency;
+	state->target_residency	= cpuidle_params_table[idx].target_residency;
+	state->flags		= CPUIDLE_FLAG_TIME_VALID;
+	state->enter		= omap3_enter_idle_bm;
+	cx->valid		= cpuidle_params_table[idx].valid;
+	sprintf(state->name, "C%d", idx + 1);
+	strncpy(state->desc, descr, CPUIDLE_DESC_LEN);
+	cpuidle_set_statedata(state, cx);
+
+	return cx;
+}
+
 /**
  * omap3_idle_init - Init routine for OMAP3 idle
  *
- * Registers the OMAP3 specific cpuidle driver with the cpuidle
+ * Registers the OMAP3 specific cpuidle driver to the cpuidle
  * framework with the valid set of states.
  */
 int __init omap3_idle_init(void)
 {
-	int i, count = 0;
-	struct omap3_processor_cx *cx;
-	struct cpuidle_state *state;
 	struct cpuidle_device *dev;
+	struct omap3_idle_statedata *cx;
 
 	mpu_pd = pwrdm_lookup("mpu_pwrdm");
 	core_pd = pwrdm_lookup("core_pwrdm");
 	per_pd = pwrdm_lookup("per_pwrdm");
 	cam_pd = pwrdm_lookup("cam_pwrdm");
 
-	omap_init_power_states();
 	cpuidle_register_driver(&omap3_idle_driver);
-
 	dev = &per_cpu(omap3_idle_dev, smp_processor_id());
 
-	for (i = OMAP3_STATE_C1; i < OMAP3_MAX_STATES; i++) {
-		cx = &omap3_power_states[i];
-		state = &dev->states[count];
-
-		if (!cx->valid)
-			continue;
-		cpuidle_set_statedata(state, cx);
-		state->exit_latency = cx->exit_latency;
-		state->target_residency = cx->target_residency;
-		state->flags = cx->flags;
-		state->enter = (state->flags & CPUIDLE_FLAG_CHECK_BM) ?
-			omap3_enter_idle_bm : omap3_enter_idle;
-		if (cx->type == OMAP3_STATE_C1)
-			dev->safe_state = state;
-		sprintf(state->name, "C%d", count+1);
-		strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
-		count++;
-	}
+	/* C1 . MPU WFI + Core active */
+	cx = _fill_cstate(dev, 0, "MPU ON + CORE ON");
+	(&dev->states[0])->enter = omap3_enter_idle;
+	dev->safe_state = &dev->states[0];
+	cx->valid = 1;	/* C1 is always valid */
+	cx->mpu_state = PWRDM_POWER_ON;
+	cx->core_state = PWRDM_POWER_ON;
+
+	/* C2 . MPU WFI + Core inactive */
+	cx = _fill_cstate(dev, 1, "MPU ON + CORE ON");
+	cx->mpu_state = PWRDM_POWER_ON;
+	cx->core_state = PWRDM_POWER_ON;
+
+	/* C3 . MPU CSWR + Core inactive */
+	cx = _fill_cstate(dev, 2, "MPU RET + CORE ON");
+	cx->mpu_state = PWRDM_POWER_RET;
+	cx->core_state = PWRDM_POWER_ON;
+
+	/* C4 . MPU OFF + Core inactive */
+	cx = _fill_cstate(dev, 3, "MPU OFF + CORE ON");
+	cx->mpu_state = PWRDM_POWER_OFF;
+	cx->core_state = PWRDM_POWER_ON;
+
+	/* C5 . MPU RET + Core RET */
+	cx = _fill_cstate(dev, 4, "MPU RET + CORE RET");
+	cx->mpu_state = PWRDM_POWER_RET;
+	cx->core_state = PWRDM_POWER_RET;
 
-	if (!count)
-		return -EINVAL;
-	dev->state_count = count;
+	/* C6 . MPU OFF + Core RET */
+	cx = _fill_cstate(dev, 5, "MPU OFF + CORE RET");
+	cx->mpu_state = PWRDM_POWER_OFF;
+	cx->core_state = PWRDM_POWER_RET;
+
+	/* C7 . MPU OFF + Core OFF */
+	cx = _fill_cstate(dev, 6, "MPU OFF + CORE OFF");
+	/*
+	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
+	 * enable OFF mode in a stable form for previous revisions.
+	 * We disable C7 state as a result.
+	 */
+	if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583)) {
+		cx->valid = 0;
+		pr_warn("%s: core off state C7 disabled due to i583\n",
+			__func__);
+	}
+	cx->mpu_state = PWRDM_POWER_OFF;
+	cx->core_state = PWRDM_POWER_OFF;
 
 	if (enable_off_mode)
 		omap3_cpuidle_update_states(PWRDM_POWER_OFF, PWRDM_POWER_OFF);
 	else
 		omap3_cpuidle_update_states(PWRDM_POWER_RET, PWRDM_POWER_RET);
 
+	dev->state_count = OMAP3_NUM_STATES;
 	if (cpuidle_register_device(dev)) {
 		printk(KERN_ERR "%s: CPUidle register device failed\n",
 		       __func__);
-- 
cgit 


From c6cd91de1cb4694f2dfcc7df831e276fffdffffc Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Mon, 9 May 2011 12:02:15 +0200
Subject: OMAP3: cpuidle: code rework for improved readability

- fix the format of single-line and multi-line comments
- remove the omap3_idle_bm_check function and replace the test
   in omap3_enter_idle_bm with the equivalent code
- re-organize omap3_enter_idle_bm code path, assign local variables
   only when needed
- reword some comments

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/cpuidle34xx.c | 52 ++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 33 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index f9c8676b1f4c..dd31e539274c 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -71,13 +71,6 @@ struct omap3_idle_statedata omap3_idle_data[OMAP3_NUM_STATES];
 
 struct powerdomain *mpu_pd, *core_pd, *per_pd, *cam_pd;
 
-static int omap3_idle_bm_check(void)
-{
-	if (!omap3_can_sleep())
-		return 1;
-	return 0;
-}
-
 static int _cpuidle_allow_idle(struct powerdomain *pwrdm,
 				struct clockdomain *clkdm)
 {
@@ -157,9 +150,7 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 					      struct cpuidle_state *curr)
 {
 	struct cpuidle_state *next = NULL;
-	struct omap3_idle_statedata *cx;
-
-	cx = cpuidle_get_statedata(curr);
+	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr);
 
 	/* Check if current state is valid */
 	if (cx->valid) {
@@ -167,9 +158,7 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 	} else {
 		int idx = OMAP3_NUM_STATES - 1;
 
-		/*
-		 * Reach the current state starting at highest C-state
-		 */
+		/* Reach the current state starting at highest C-state */
 		for (; idx >= 0; idx--) {
 			if (&dev->states[idx] == curr) {
 				next = &dev->states[idx];
@@ -177,9 +166,7 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 			}
 		}
 
-		/*
-		 * Should never hit this condition.
-		 */
+		/* Should never hit this condition */
 		WARN_ON(next == NULL);
 
 		/*
@@ -214,29 +201,16 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 			       struct cpuidle_state *state)
 {
-	struct cpuidle_state *new_state = next_valid_state(dev, state);
-	u32 core_next_state, per_next_state = 0, per_saved_state = 0;
-	u32 cam_state;
+	struct cpuidle_state *new_state;
+	u32 core_next_state, per_next_state = 0, per_saved_state = 0, cam_state;
 	struct omap3_idle_statedata *cx;
 	int ret;
 
-	if (omap3_idle_bm_check()) {
-		BUG_ON(!dev->safe_state);
+	if (!omap3_can_sleep()) {
 		new_state = dev->safe_state;
 		goto select_state;
 	}
 
-	cx = cpuidle_get_statedata(state);
-	core_next_state = cx->core_state;
-
-	/*
-	 * FIXME: we currently manage device-specific idle states
-	 *        for PER and CORE in combination with CPU-specific
-	 *        idle states.  This is wrong, and device-specific
-	 *        idle management needs to be separated out into 
-	 *        its own code.
-	 */
-
 	/*
 	 * Prevent idle completely if CAM is active.
 	 * CAM does not have wakeup capability in OMAP3.
@@ -247,10 +221,20 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 		goto select_state;
 	}
 
+	/*
+	 * FIXME: we currently manage device-specific idle states
+	 *        for PER and CORE in combination with CPU-specific
+	 *        idle states.  This is wrong, and device-specific
+	 *        idle management needs to be separated out into
+	 *        its own code.
+	 */
+
 	/*
 	 * Prevent PER off if CORE is not in retention or off as this
 	 * would disable PER wakeups completely.
 	 */
+	cx = cpuidle_get_statedata(state);
+	core_next_state = cx->core_state;
 	per_next_state = per_saved_state = pwrdm_read_next_pwrst(per_pd);
 	if ((per_next_state == PWRDM_POWER_OFF) &&
 	    (core_next_state > PWRDM_POWER_RET))
@@ -260,6 +244,8 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	if (per_next_state != per_saved_state)
 		pwrdm_set_next_pwrst(per_pd, per_next_state);
 
+	new_state = next_valid_state(dev, state);
+
 select_state:
 	dev->last_state = new_state;
 	ret = omap3_enter_idle(dev, new_state);
@@ -320,7 +306,7 @@ struct cpuidle_driver omap3_idle_driver = {
 	.owner = 	THIS_MODULE,
 };
 
-/* Fill in the state data from the mach tables and register the driver_data */
+/* Helper to fill the C-state common data and register the driver_data */
 static inline struct omap3_idle_statedata *_fill_cstate(
 					struct cpuidle_device *dev,
 					int idx, const char *descr)
-- 
cgit 


From 04908918191f7926ec7af99890fb4ddb3b769c13 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Mon, 9 May 2011 12:02:16 +0200
Subject: OMAP3: cpuidle: change the power domains modes determination logic

The achievable power modes of the power domains in cpuidle
depend on the system-wide 'enable_off_mode' knob in debugfs.
Upon changing enable_off_mode, do not change the C-states
'valid' field but instead dynamically restrict the power modes
when entering idle.

The C-states 'valid' field is just used to enable/disable some
C-states at init and shall not be changed later on.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/cpuidle34xx.c | 58 ++++++++++++++++-----------------------
 arch/arm/mach-omap2/pm.h          |  4 ---
 arch/arm/mach-omap2/pm34xx.c      | 12 --------
 3 files changed, 24 insertions(+), 50 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index dd31e539274c..4bf6e6e8b100 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -138,22 +138,40 @@ return_sleep_time:
 }
 
 /**
- * next_valid_state - Find next valid c-state
+ * next_valid_state - Find next valid C-state
  * @dev: cpuidle device
- * @state: Currently selected c-state
+ * @state: Currently selected C-state
  *
  * If the current state is valid, it is returned back to the caller.
  * Else, this function searches for a lower c-state which is still
  * valid.
+ *
+ * A state is valid if the 'valid' field is enabled and
+ * if it satisfies the enable_off_mode condition.
  */
 static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 					      struct cpuidle_state *curr)
 {
 	struct cpuidle_state *next = NULL;
 	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr);
+	u32 mpu_deepest_state = PWRDM_POWER_RET;
+	u32 core_deepest_state = PWRDM_POWER_RET;
+
+	if (enable_off_mode) {
+		mpu_deepest_state = PWRDM_POWER_OFF;
+		/*
+		 * Erratum i583: valable for ES rev < Es1.2 on 3630.
+		 * CORE OFF mode is not supported in a stable form, restrict
+		 * instead the CORE state to RET.
+		 */
+		if (!IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583))
+			core_deepest_state = PWRDM_POWER_OFF;
+	}
 
 	/* Check if current state is valid */
-	if (cx->valid) {
+	if ((cx->valid) &&
+	    (cx->mpu_state >= mpu_deepest_state) &&
+	    (cx->core_state >= core_deepest_state)) {
 		return curr;
 	} else {
 		int idx = OMAP3_NUM_STATES - 1;
@@ -176,7 +194,9 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 		idx--;
 		for (; idx >= 0; idx--) {
 			cx = cpuidle_get_statedata(&dev->states[idx]);
-			if (cx->valid) {
+			if ((cx->valid) &&
+			    (cx->mpu_state >= mpu_deepest_state) &&
+			    (cx->core_state >= core_deepest_state)) {
 				next = &dev->states[idx];
 				break;
 			}
@@ -259,31 +279,6 @@ select_state:
 
 DEFINE_PER_CPU(struct cpuidle_device, omap3_idle_dev);
 
-/**
- * omap3_cpuidle_update_states() - Update the cpuidle states
- * @mpu_deepest_state:	Enable states up to and including this for mpu domain
- * @core_deepest_state:	Enable states up to and including this for core domain
- *
- * This goes through the list of states available and enables and disables the
- * validity of C states based on deepest state that can be achieved for the
- * variable domain
- */
-void omap3_cpuidle_update_states(u32 mpu_deepest_state, u32 core_deepest_state)
-{
-	int i;
-
-	for (i = 0; i < OMAP3_NUM_STATES; i++) {
-		struct omap3_idle_statedata *cx = &omap3_idle_data[i];
-
-		if ((cx->mpu_state >= mpu_deepest_state) &&
-		    (cx->core_state >= core_deepest_state)) {
-			cx->valid = 1;
-		} else {
-			cx->valid = 0;
-		}
-	}
-}
-
 void omap3_pm_init_cpuidle(struct cpuidle_params *cpuidle_board_params)
 {
 	int i;
@@ -393,11 +388,6 @@ int __init omap3_idle_init(void)
 	cx->mpu_state = PWRDM_POWER_OFF;
 	cx->core_state = PWRDM_POWER_OFF;
 
-	if (enable_off_mode)
-		omap3_cpuidle_update_states(PWRDM_POWER_OFF, PWRDM_POWER_OFF);
-	else
-		omap3_cpuidle_update_states(PWRDM_POWER_RET, PWRDM_POWER_RET);
-
 	dev->state_count = OMAP3_NUM_STATES;
 	if (cpuidle_register_device(dev)) {
 		printk(KERN_ERR "%s: CPUidle register device failed\n",
diff --git a/arch/arm/mach-omap2/pm.h b/arch/arm/mach-omap2/pm.h
index 32dbc1311efd..45bcfce77352 100644
--- a/arch/arm/mach-omap2/pm.h
+++ b/arch/arm/mach-omap2/pm.h
@@ -78,10 +78,6 @@ extern u32 sleep_while_idle;
 #define sleep_while_idle 0
 #endif
 
-#if defined(CONFIG_CPU_IDLE)
-extern void omap3_cpuidle_update_states(u32, u32);
-#endif
-
 #if defined(CONFIG_PM_DEBUG) && defined(CONFIG_DEBUG_FS)
 extern void pm_dbg_update_time(struct powerdomain *pwrdm, int prev);
 extern int pm_dbg_regset_save(int reg_set);
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index 0c5e3a46a3ad..caf9f6cb3908 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -779,18 +779,6 @@ void omap3_pm_off_mode_enable(int enable)
 	else
 		state = PWRDM_POWER_RET;
 
-#ifdef CONFIG_CPU_IDLE
-	/*
-	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
-	 * enable OFF mode in a stable form for previous revisions, restrict
-	 * instead to RET
-	 */
-	if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583))
-		omap3_cpuidle_update_states(state, PWRDM_POWER_RET);
-	else
-		omap3_cpuidle_update_states(state, state);
-#endif
-
 	list_for_each_entry(pwrst, &pwrst_list, node) {
 		if (IS_PM34XX_ERRATUM(PM_SDRC_WAKEUP_ERRATUM_i583) &&
 				pwrst->pwrdm == core_pwrdm &&
-- 
cgit 


From 99aa18278e867574d72201b806f82ace07d4804b Mon Sep 17 00:00:00 2001
From: Sanjeev Premi <premi@ti.com>
Date: Wed, 18 May 2011 14:44:40 +0530
Subject: OMAP3: PM: Boot message is not an error, and not helpful, remove it

It shows up on the console despite using "silent" in the bootargs, and
it's really just noise in the boot log since PM init is always called.

Signed-off-by: Sanjeev Premi <premi@ti.com>
Cc: jhnikula@gmail.com
[khilman@ti.com: minor changelog edits]
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/pm34xx.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index caf9f6cb3908..c155c9d1c82c 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -883,8 +883,6 @@ static int __init omap3_pm_init(void)
 
 	pm_errata_configure();
 
-	printk(KERN_ERR "Power Management for TI OMAP3.\n");
-
 	/* XXX prcm_setup_regs needs to be before enabling hw
 	 * supervised mode for powerdomains */
 	prcm_setup_regs();
-- 
cgit