diff --git a/Documentation/devicetree/bindings/arm/l2cc.txt b/Documentation/devicetree/bindings/arm/l2cc.txt
index 0dbabe9a6b0a..2251dccb141e 100644
--- a/Documentation/devicetree/bindings/arm/l2cc.txt
+++ b/Documentation/devicetree/bindings/arm/l2cc.txt
@@ -67,6 +67,11 @@ Optional properties:
   disable if zero.
 - arm,prefetch-offset : Override prefetch offset value. Valid values are
   0-7, 15, 23, and 31.
+- prefetch-data : Data prefetch. Value: <0> (forcibly disable), <1>
+  (forcibly enable), property absent (retain settings set by firmware)
+- prefetch-instr : Instruction prefetch. Value: <0> (forcibly disable),
+  <1> (forcibly enable), property absent (retain settings set by
+  firmware)
 
 Example:
 
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index e49e423268c0..04d34f6a58f3 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -856,6 +856,10 @@ address which can extend beyond that limit.
   name may clash with standard defined ones, you prefix them with your
   vendor name and a comma.
 
+  Additional properties for the root node:
+
+    - serial-number : a string representing the device's serial number
+
   b) The /cpus node
 
   This node is the parent of all individual CPU nodes. It doesn't
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index cf292d3ec27f..a750c1425c3a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -33,8 +33,8 @@ config ARM
 	select HARDIRQS_SW_RESEND
 	select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
 	select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
-	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
-	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
+	select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
 	select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_BPF_JIT
@@ -45,7 +45,7 @@ config ARM
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_CONTIGUOUS if MMU
-	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
+	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
@@ -59,10 +59,10 @@ config ARM
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_XZ
-	select HAVE_KPROBES if !XIP_KERNEL
+	select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	select HAVE_MEMBLOCK
-	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
+	select HAVE_MOD_ARCH_SPECIFIC
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
@@ -173,7 +173,7 @@ config LOCKDEP_SUPPORT
 
 config TRACE_IRQFLAGS_SUPPORT
 	bool
-	default y
+	default !CPU_V7M
 
 config RWSEM_XCHGADD_ALGORITHM
 	bool
@@ -1010,11 +1010,6 @@ config PLAT_PXA
 config PLAT_VERSATILE
 	bool
 
-config ARM_TIMER_SP804
-	bool
-	select CLKSRC_MMIO
-	select CLKSRC_OF if OF
-
 source "arch/arm/firmware/Kconfig"
 
 source arch/arm/mm/Kconfig
@@ -1342,6 +1337,7 @@ config SMP
 	depends on GENERIC_CLOCKEVENTS
 	depends on HAVE_SMP
 	depends on MMU || ARM_MPU
+	select IRQ_WORK
 	help
 	  This enables support for systems with more than one CPU. If you have
 	  a system with only one CPU, say N. If you have a system with more
@@ -1717,6 +1713,21 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 config ARCH_WANT_GENERAL_HUGETLB
 	def_bool y
 
+config ARM_MODULE_PLTS
+	bool "Use PLTs to allow module memory to spill over into vmalloc area"
+	depends on MODULES
+	help
+	  Allocate PLTs when loading modules so that jumps and calls whose
+	  targets are too far away for their relative offsets to be encoded
+	  in the instructions themselves can be bounced via veneers in the
+	  module's PLT. This allows modules to be allocated in the generic
+	  vmalloc area after the dedicated module memory area has been
+	  exhausted. The modules will use slightly more memory, but after
+	  rounding up to page size, the actual memory footprint is usually
+	  the same.
+
+	  Say y if you are getting out of memory errors while loading modules
+
 source "mm/Kconfig"
 
 config FORCE_MAX_ZONEORDER
@@ -1987,6 +1998,7 @@ config XIP_PHYS_ADDR
 config KEXEC
 	bool "Kexec system call (EXPERIMENTAL)"
 	depends on (!SMP || PM_SLEEP_SMP)
+	depends on !CPU_V7M
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index a6b5d0e35968..f1b157971366 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -5,6 +5,7 @@ source "lib/Kconfig.debug"
 config ARM_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
 	depends on DEBUG_KERNEL
+	depends on MMU
 	select DEBUG_FS
 	---help---
 	  Say Y here if you want to show the kernel pagetable layout in a
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 2a4fae7e9c44..07ab3d203916 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -19,6 +19,10 @@ LDFLAGS_vmlinux	+= --be8
 LDFLAGS_MODULE	+= --be8
 endif
 
+ifeq ($(CONFIG_ARM_MODULE_PLTS),y)
+LDFLAGS_MODULE	+= -T $(srctree)/arch/arm/kernel/module.lds
+endif
+
 OBJCOPYFLAGS	:=-O binary -R .comment -S
 GZFLAGS		:=-9
 #KBUILD_CFLAGS	+=-pipe
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 6e1fb2b2ecc7..7a13aebacf81 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -103,6 +103,8 @@ extra-y       += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern piggy.lz4 \
 		 lib1funcs.S ashldi3.S bswapsdi2.S $(libfdt) $(libfdt_hdrs) \
 		 hyp-stub.S
 
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+
 ifeq ($(CONFIG_FUNCTION_TRACER),y)
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 2c45b5709fa4..06e983f59980 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -130,7 +130,7 @@ start:
 		.endr
    ARM(		mov	r0, r0		)
    ARM(		b	1f		)
- THUMB(		adr	r12, BSYM(1f)	)
+ THUMB(		badr	r12, 1f		)
  THUMB(		bx	r12		)
 
 		.word	_magic_sig	@ Magic numbers to help the loader
@@ -447,7 +447,7 @@ dtb_check_done:
 
 		bl	cache_clean_flush
 
-		adr	r0, BSYM(restart)
+		badr	r0, restart
 		add	r0, r0, r6
 		mov	pc, r0
 
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 70b1eff477b3..6ee5959a813b 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -11,7 +11,6 @@ obj-$(CONFIG_SHARP_LOCOMO)	+= locomo.o
 obj-$(CONFIG_SHARP_PARAM)	+= sharpsl_param.o
 obj-$(CONFIG_SHARP_SCOOP)	+= scoop.o
 obj-$(CONFIG_PCI_HOST_ITE8152)  += it8152.o
-obj-$(CONFIG_ARM_TIMER_SP804)	+= timer-sp.o
 obj-$(CONFIG_MCPM)		+= mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 CFLAGS_REMOVE_mcpm_entry.o	= -pg
 AFLAGS_mcpm_head.o		:= -march=armv7-a
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 5f8a52ac7edf..a923524d1040 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -20,6 +20,126 @@
 #include <asm/cputype.h>
 #include <asm/suspend.h>
 
+/*
+ * The public API for this code is documented in arch/arm/include/asm/mcpm.h.
+ * For a comprehensive description of the main algorithm used here, please
+ * see Documentation/arm/cluster-pm-race-avoidance.txt.
+ */
+
+struct sync_struct mcpm_sync;
+
+/*
+ * __mcpm_cpu_going_down: Indicates that the cpu is being torn down.
+ *    This must be called at the point of committing to teardown of a CPU.
+ *    The CPU cache (SCTRL.C bit) is expected to still be active.
+ */
+static void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster)
+{
+	mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_GOING_DOWN;
+	sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
+}
+
+/*
+ * __mcpm_cpu_down: Indicates that cpu teardown is complete and that the
+ *    cluster can be torn down without disrupting this CPU.
+ *    To avoid deadlocks, this must be called before a CPU is powered down.
+ *    The CPU cache (SCTRL.C bit) is expected to be off.
+ *    However L2 cache might or might not be active.
+ */
+static void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster)
+{
+	dmb();
+	mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_DOWN;
+	sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
+	sev();
+}
+
+/*
+ * __mcpm_outbound_leave_critical: Leave the cluster teardown critical section.
+ * @state: the final state of the cluster:
+ *     CLUSTER_UP: no destructive teardown was done and the cluster has been
+ *         restored to the previous state (CPU cache still active); or
+ *     CLUSTER_DOWN: the cluster has been torn-down, ready for power-off
+ *         (CPU cache disabled, L2 cache either enabled or disabled).
+ */
+static void __mcpm_outbound_leave_critical(unsigned int cluster, int state)
+{
+	dmb();
+	mcpm_sync.clusters[cluster].cluster = state;
+	sync_cache_w(&mcpm_sync.clusters[cluster].cluster);
+	sev();
+}
+
+/*
+ * __mcpm_outbound_enter_critical: Enter the cluster teardown critical section.
+ * This function should be called by the last man, after local CPU teardown
+ * is complete.  CPU cache expected to be active.
+ *
+ * Returns:
+ *     false: the critical section was not entered because an inbound CPU was
+ *         observed, or the cluster is already being set up;
+ *     true: the critical section was entered: it is now safe to tear down the
+ *         cluster.
+ */
+static bool __mcpm_outbound_enter_critical(unsigned int cpu, unsigned int cluster)
+{
+	unsigned int i;
+	struct mcpm_sync_struct *c = &mcpm_sync.clusters[cluster];
+
+	/* Warn inbound CPUs that the cluster is being torn down: */
+	c->cluster = CLUSTER_GOING_DOWN;
+	sync_cache_w(&c->cluster);
+
+	/* Back out if the inbound cluster is already in the critical region: */
+	sync_cache_r(&c->inbound);
+	if (c->inbound == INBOUND_COMING_UP)
+		goto abort;
+
+	/*
+	 * Wait for all CPUs to get out of the GOING_DOWN state, so that local
+	 * teardown is complete on each CPU before tearing down the cluster.
+	 *
+	 * If any CPU has been woken up again from the DOWN state, then we
+	 * shouldn't be taking the cluster down at all: abort in that case.
+	 */
+	sync_cache_r(&c->cpus);
+	for (i = 0; i < MAX_CPUS_PER_CLUSTER; i++) {
+		int cpustate;
+
+		if (i == cpu)
+			continue;
+
+		while (1) {
+			cpustate = c->cpus[i].cpu;
+			if (cpustate != CPU_GOING_DOWN)
+				break;
+
+			wfe();
+			sync_cache_r(&c->cpus[i].cpu);
+		}
+
+		switch (cpustate) {
+		case CPU_DOWN:
+			continue;
+
+		default:
+			goto abort;
+		}
+	}
+
+	return true;
+
+abort:
+	__mcpm_outbound_leave_critical(cluster, CLUSTER_UP);
+	return false;
+}
+
+static int __mcpm_cluster_state(unsigned int cluster)
+{
+	sync_cache_r(&mcpm_sync.clusters[cluster].cluster);
+	return mcpm_sync.clusters[cluster].cluster;
+}
+
 extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
 
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
@@ -78,16 +198,11 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster)
 	bool cpu_is_down, cluster_is_down;
 	int ret = 0;
 
+	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
 	if (!platform_ops)
 		return -EUNATCH; /* try not to shadow power_up errors */
 	might_sleep();
 
-	/* backward compatibility callback */
-	if (platform_ops->power_up)
-		return platform_ops->power_up(cpu, cluster);
-
-	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
-
 	/*
 	 * Since this is called with IRQs enabled, and no arch_spin_lock_irq
 	 * variant exists, we need to disable IRQs manually here.
@@ -128,29 +243,17 @@ void mcpm_cpu_power_down(void)
 	bool cpu_going_down, last_man;
 	phys_reset_t phys_reset;
 
-	if (WARN_ON_ONCE(!platform_ops))
-	       return;
-	BUG_ON(!irqs_disabled());
-
-	/*
-	 * Do this before calling into the power_down method,
-	 * as it might not always be safe to do afterwards.
-	 */
-	setup_mm_for_reboot();
-
-	/* backward compatibility callback */
-	if (platform_ops->power_down) {
-		platform_ops->power_down();
-		goto not_dead;
-	}
-
 	mpidr = read_cpuid_mpidr();
 	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
 	pr_debug("%s: cpu %u cluster %u\n", __func__, cpu, cluster);
+	if (WARN_ON_ONCE(!platform_ops))
+	       return;
+	BUG_ON(!irqs_disabled());
+
+	setup_mm_for_reboot();
 
 	__mcpm_cpu_going_down(cpu, cluster);
-
 	arch_spin_lock(&mcpm_lock);
 	BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
 
@@ -187,7 +290,6 @@ void mcpm_cpu_power_down(void)
 	if (cpu_going_down)
 		wfi();
 
-not_dead:
 	/*
 	 * It is possible for a power_up request to happen concurrently
 	 * with a power_down request for the same CPU. In this case the
@@ -219,22 +321,11 @@ int mcpm_wait_for_cpu_powerdown(unsigned int cpu, unsigned int cluster)
 	return ret;
 }
 
-void mcpm_cpu_suspend(u64 expected_residency)
+void mcpm_cpu_suspend(void)
 {
 	if (WARN_ON_ONCE(!platform_ops))
 		return;
 
-	/* backward compatibility callback */
-	if (platform_ops->suspend) {
-		phys_reset_t phys_reset;
-		BUG_ON(!irqs_disabled());
-		setup_mm_for_reboot();
-		platform_ops->suspend(expected_residency);
-		phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
-		phys_reset(virt_to_phys(mcpm_entry_point));
-		BUG();
-	}
-
 	/* Some platforms might have to enable special resume modes, etc. */
 	if (platform_ops->cpu_suspend_prepare) {
 		unsigned int mpidr = read_cpuid_mpidr();
@@ -256,12 +347,6 @@ int mcpm_cpu_powered_up(void)
 	if (!platform_ops)
 		return -EUNATCH;
 
-	/* backward compatibility callback */
-	if (platform_ops->powered_up) {
-		platform_ops->powered_up();
-		return 0;
-	}
-
 	mpidr = read_cpuid_mpidr();
 	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
@@ -334,120 +419,6 @@ int __init mcpm_loopback(void (*cache_disable)(void))
 
 #endif
 
-struct sync_struct mcpm_sync;
-
-/*
- * __mcpm_cpu_going_down: Indicates that the cpu is being torn down.
- *    This must be called at the point of committing to teardown of a CPU.
- *    The CPU cache (SCTRL.C bit) is expected to still be active.
- */
-void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster)
-{
-	mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_GOING_DOWN;
-	sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
-}
-
-/*
- * __mcpm_cpu_down: Indicates that cpu teardown is complete and that the
- *    cluster can be torn down without disrupting this CPU.
- *    To avoid deadlocks, this must be called before a CPU is powered down.
- *    The CPU cache (SCTRL.C bit) is expected to be off.
- *    However L2 cache might or might not be active.
- */
-void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster)
-{
-	dmb();
-	mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_DOWN;
-	sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
-	sev();
-}
-
-/*
- * __mcpm_outbound_leave_critical: Leave the cluster teardown critical section.
- * @state: the final state of the cluster:
- *     CLUSTER_UP: no destructive teardown was done and the cluster has been
- *         restored to the previous state (CPU cache still active); or
- *     CLUSTER_DOWN: the cluster has been torn-down, ready for power-off
- *         (CPU cache disabled, L2 cache either enabled or disabled).
- */
-void __mcpm_outbound_leave_critical(unsigned int cluster, int state)
-{
-	dmb();
-	mcpm_sync.clusters[cluster].cluster = state;
-	sync_cache_w(&mcpm_sync.clusters[cluster].cluster);
-	sev();
-}
-
-/*
- * __mcpm_outbound_enter_critical: Enter the cluster teardown critical section.
- * This function should be called by the last man, after local CPU teardown
- * is complete.  CPU cache expected to be active.
- *
- * Returns:
- *     false: the critical section was not entered because an inbound CPU was
- *         observed, or the cluster is already being set up;
- *     true: the critical section was entered: it is now safe to tear down the
- *         cluster.
- */
-bool __mcpm_outbound_enter_critical(unsigned int cpu, unsigned int cluster)
-{
-	unsigned int i;
-	struct mcpm_sync_struct *c = &mcpm_sync.clusters[cluster];
-
-	/* Warn inbound CPUs that the cluster is being torn down: */
-	c->cluster = CLUSTER_GOING_DOWN;
-	sync_cache_w(&c->cluster);
-
-	/* Back out if the inbound cluster is already in the critical region: */
-	sync_cache_r(&c->inbound);
-	if (c->inbound == INBOUND_COMING_UP)
-		goto abort;
-
-	/*
-	 * Wait for all CPUs to get out of the GOING_DOWN state, so that local
-	 * teardown is complete on each CPU before tearing down the cluster.
-	 *
-	 * If any CPU has been woken up again from the DOWN state, then we
-	 * shouldn't be taking the cluster down at all: abort in that case.
-	 */
-	sync_cache_r(&c->cpus);
-	for (i = 0; i < MAX_CPUS_PER_CLUSTER; i++) {
-		int cpustate;
-
-		if (i == cpu)
-			continue;
-
-		while (1) {
-			cpustate = c->cpus[i].cpu;
-			if (cpustate != CPU_GOING_DOWN)
-				break;
-
-			wfe();
-			sync_cache_r(&c->cpus[i].cpu);
-		}
-
-		switch (cpustate) {
-		case CPU_DOWN:
-			continue;
-
-		default:
-			goto abort;
-		}
-	}
-
-	return true;
-
-abort:
-	__mcpm_outbound_leave_critical(cluster, CLUSTER_UP);
-	return false;
-}
-
-int __mcpm_cluster_state(unsigned int cluster)
-{
-	sync_cache_r(&mcpm_sync.clusters[cluster].cluster);
-	return mcpm_sync.clusters[cluster].cluster;
-}
-
 extern unsigned long mcpm_power_up_setup_phys;
 
 int __init mcpm_sync_init(
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index e02db4b81a66..08b3bb9bc6a2 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -49,7 +49,7 @@
 ENTRY(mcpm_entry_point)
 
  ARM_BE8(setend        be)
- THUMB(	adr	r12, BSYM(1f)	)
+ THUMB(	badr	r12, 1f		)
  THUMB(	bx	r12		)
  THUMB(	.thumb			)
 1:
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 186270b3e194..4abe57279c66 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -177,6 +177,21 @@
 	restore_irqs_notrace \oldcpsr
 	.endm
 
+/*
+ * Assembly version of "adr rd, BSYM(sym)".  This should only be used to
+ * reference local symbols in the same assembly file which are to be
+ * resolved by the assembler.  Other usage is undefined.
+ */
+	.irp	c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+	.macro	badr\c, rd, sym
+#ifdef CONFIG_THUMB2_KERNEL
+	adr\c	\rd, \sym + 1
+#else
+	adr\c	\rd, \sym
+#endif
+	.endm
+	.endr
+
 /*
  * Get current thread_info.
  */
@@ -326,7 +341,7 @@
 THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 	bne	1f
 	orr	\reg, \reg, #PSR_A_BIT
-	adr	lr, BSYM(2f)
+	badr	lr, 2f
 	msr	spsr_cxsf, \reg
 	__MSR_ELR_HYP(14)
 	__ERET
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 2d46862e7bef..4812cda8fd17 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -482,10 +482,17 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
 	: : : "r0","r1","r2","r3","r4","r5","r6","r7", \
 	      "r9","r10","lr","memory" )
 
+#ifdef CONFIG_MMU
 int set_memory_ro(unsigned long addr, int numpages);
 int set_memory_rw(unsigned long addr, int numpages);
 int set_memory_x(unsigned long addr, int numpages);
 int set_memory_nx(unsigned long addr, int numpages);
+#else
+static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
+#endif
 
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
index abb2c3769b01..1692a05d3207 100644
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
@@ -94,6 +94,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 		break;
 #endif
 	default:
+		/* Cause a link-time error, the xchg() size is not supported */
 		__bad_xchg(ptr, size), ret = 0;
 		break;
 	}
@@ -102,8 +103,10 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 	return ret;
 }
 
-#define xchg(ptr,x) \
-	((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
+#define xchg(ptr, x) ({							\
+	(__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr),		\
+				   sizeof(*(ptr)));			\
+})
 
 #include <asm-generic/cmpxchg-local.h>
 
@@ -118,14 +121,16 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
  * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make
  * them available.
  */
-#define cmpxchg_local(ptr, o, n)				  	       \
-	((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr), (unsigned long)(o),\
-			(unsigned long)(n), sizeof(*(ptr))))
+#define cmpxchg_local(ptr, o, n) ({					\
+	(__typeof(*ptr))__cmpxchg_local_generic((ptr),			\
+					        (unsigned long)(o),	\
+					        (unsigned long)(n),	\
+					        sizeof(*(ptr)));	\
+})
+
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 
-#ifndef CONFIG_SMP
 #include <asm-generic/cmpxchg.h>
-#endif
 
 #else	/* min ARCH >= ARMv6 */
 
@@ -201,11 +206,12 @@ static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old,
 	return ret;
 }
 
-#define cmpxchg(ptr,o,n)						\
-	((__typeof__(*(ptr)))__cmpxchg_mb((ptr),			\
-					  (unsigned long)(o),		\
-					  (unsigned long)(n),		\
-					  sizeof(*(ptr))))
+#define cmpxchg(ptr,o,n) ({						\
+	(__typeof__(*(ptr)))__cmpxchg_mb((ptr),				\
+					 (unsigned long)(o),		\
+					 (unsigned long)(n),		\
+					 sizeof(*(ptr)));		\
+})
 
 static inline unsigned long __cmpxchg_local(volatile void *ptr,
 					    unsigned long old,
@@ -227,6 +233,13 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 	return ret;
 }
 
+#define cmpxchg_local(ptr, o, n) ({					\
+	(__typeof(*ptr))__cmpxchg_local((ptr),				\
+				        (unsigned long)(o),		\
+				        (unsigned long)(n),		\
+				        sizeof(*(ptr)));		\
+})
+
 static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
 					     unsigned long long old,
 					     unsigned long long new)
@@ -252,6 +265,14 @@ static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
 	return oldval;
 }
 
+#define cmpxchg64_relaxed(ptr, o, n) ({					\
+	(__typeof__(*(ptr)))__cmpxchg64((ptr),				\
+					(unsigned long long)(o),	\
+					(unsigned long long)(n));	\
+})
+
+#define cmpxchg64_local(ptr, o, n) cmpxchg64_relaxed((ptr), (o), (n))
+
 static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
 						unsigned long long old,
 						unsigned long long new)
@@ -265,23 +286,11 @@ static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
 	return ret;
 }
 
-#define cmpxchg_local(ptr,o,n)						\
-	((__typeof__(*(ptr)))__cmpxchg_local((ptr),			\
-				       (unsigned long)(o),		\
-				       (unsigned long)(n),		\
-				       sizeof(*(ptr))))
-
-#define cmpxchg64(ptr, o, n)						\
-	((__typeof__(*(ptr)))__cmpxchg64_mb((ptr),			\
-					(unsigned long long)(o),	\
-					(unsigned long long)(n)))
-
-#define cmpxchg64_relaxed(ptr, o, n)					\
-	((__typeof__(*(ptr)))__cmpxchg64((ptr),				\
-					(unsigned long long)(o),	\
-					(unsigned long long)(n)))
-
-#define cmpxchg64_local(ptr, o, n)	cmpxchg64_relaxed((ptr), (o), (n))
+#define cmpxchg64(ptr, o, n) ({						\
+	(__typeof__(*(ptr)))__cmpxchg64_mb((ptr),			\
+					   (unsigned long long)(o),	\
+					   (unsigned long long)(n));	\
+})
 
 #endif	/* __LINUX_ARM_ARCH__ >= 6 */
 
diff --git a/arch/arm/include/asm/entry-macro-multi.S b/arch/arm/include/asm/entry-macro-multi.S
index 469a2b30fa27..609184f522ee 100644
--- a/arch/arm/include/asm/entry-macro-multi.S
+++ b/arch/arm/include/asm/entry-macro-multi.S
@@ -10,7 +10,7 @@
 	@
 	@ routine called with r0 = irq number, r1 = struct pt_regs *
 	@
-	adrne	lr, BSYM(1b)
+	badrne	lr, 1b
 	bne	asm_do_IRQ
 
 #ifdef CONFIG_SMP
@@ -23,7 +23,7 @@
 	ALT_SMP(test_for_ipi r0, r2, r6, lr)
 	ALT_UP_B(9997f)
 	movne	r1, sp
-	adrne	lr, BSYM(1b)
+	badrne	lr, 1b
 	bne	do_IPI
 #endif
 9997:
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 1b7677d1e5e1..1c3938f26beb 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -23,6 +23,7 @@
 
 #ifdef __KERNEL__
 
+#include <linux/string.h>
 #include <linux/types.h>
 #include <linux/blk_types.h>
 #include <asm/byteorder.h>
@@ -73,17 +74,16 @@ void __raw_readsl(const volatile void __iomem *addr, void *data, int longlen);
 static inline void __raw_writew(u16 val, volatile void __iomem *addr)
 {
 	asm volatile("strh %1, %0"
-		     : "+Q" (*(volatile u16 __force *)addr)
-		     : "r" (val));
+		     : : "Q" (*(volatile u16 __force *)addr), "r" (val));
 }
 
 #define __raw_readw __raw_readw
 static inline u16 __raw_readw(const volatile void __iomem *addr)
 {
 	u16 val;
-	asm volatile("ldrh %1, %0"
-		     : "+Q" (*(volatile u16 __force *)addr),
-		       "=r" (val));
+	asm volatile("ldrh %0, %1"
+		     : "=r" (val)
+		     : "Q" (*(volatile u16 __force *)addr));
 	return val;
 }
 #endif
@@ -92,25 +92,23 @@ static inline u16 __raw_readw(const volatile void __iomem *addr)
 static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
 {
 	asm volatile("strb %1, %0"
-		     : "+Qo" (*(volatile u8 __force *)addr)
-		     : "r" (val));
+		     : : "Qo" (*(volatile u8 __force *)addr), "r" (val));
 }
 
 #define __raw_writel __raw_writel
 static inline void __raw_writel(u32 val, volatile void __iomem *addr)
 {
 	asm volatile("str %1, %0"
-		     : "+Qo" (*(volatile u32 __force *)addr)
-		     : "r" (val));
+		     : : "Qo" (*(volatile u32 __force *)addr), "r" (val));
 }
 
 #define __raw_readb __raw_readb
 static inline u8 __raw_readb(const volatile void __iomem *addr)
 {
 	u8 val;
-	asm volatile("ldrb %1, %0"
-		     : "+Qo" (*(volatile u8 __force *)addr),
-		       "=r" (val));
+	asm volatile("ldrb %0, %1"
+		     : "=r" (val)
+		     : "Qo" (*(volatile u8 __force *)addr));
 	return val;
 }
 
@@ -118,9 +116,9 @@ static inline u8 __raw_readb(const volatile void __iomem *addr)
 static inline u32 __raw_readl(const volatile void __iomem *addr)
 {
 	u32 val;
-	asm volatile("ldr %1, %0"
-		     : "+Qo" (*(volatile u32 __force *)addr),
-		       "=r" (val));
+	asm volatile("ldr %0, %1"
+		     : "=r" (val)
+		     : "Qo" (*(volatile u32 __force *)addr));
 	return val;
 }
 
@@ -319,9 +317,33 @@ extern void _memset_io(volatile void __iomem *, int, size_t);
 #define writesw(p,d,l)		__raw_writesw(p,d,l)
 #define writesl(p,d,l)		__raw_writesl(p,d,l)
 
+#ifndef __ARMBE__
+static inline void memset_io(volatile void __iomem *dst, unsigned c,
+	size_t count)
+{
+	memset((void __force *)dst, c, count);
+}
+#define memset_io(dst,c,count) memset_io(dst,c,count)
+
+static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
+	size_t count)
+{
+	memcpy(to, (const void __force *)from, count);
+}
+#define memcpy_fromio(to,from,count) memcpy_fromio(to,from,count)
+
+static inline void memcpy_toio(volatile void __iomem *to, const void *from,
+	size_t count)
+{
+	memcpy((void __force *)to, from, count);
+}
+#define memcpy_toio(to,from,count) memcpy_toio(to,from,count)
+
+#else
 #define memset_io(c,v,l)	_memset_io(c,(v),(l))
 #define memcpy_fromio(a,c,l)	_memcpy_fromio((a),c,(l))
 #define memcpy_toio(c,a,l)	_memcpy_toio(c,(a),(l))
+#endif
 
 #endif	/* readl */
 
diff --git a/arch/arm/include/asm/irqflags.h b/arch/arm/include/asm/irqflags.h
index 3b763d6652a0..43908146a5cf 100644
--- a/arch/arm/include/asm/irqflags.h
+++ b/arch/arm/include/asm/irqflags.h
@@ -20,6 +20,7 @@
 
 #if __LINUX_ARM_ARCH__ >= 6
 
+#define arch_local_irq_save arch_local_irq_save
 static inline unsigned long arch_local_irq_save(void)
 {
 	unsigned long flags;
@@ -31,6 +32,7 @@ static inline unsigned long arch_local_irq_save(void)
 	return flags;
 }
 
+#define arch_local_irq_enable arch_local_irq_enable
 static inline void arch_local_irq_enable(void)
 {
 	asm volatile(
@@ -40,6 +42,7 @@ static inline void arch_local_irq_enable(void)
 		: "memory", "cc");
 }
 
+#define arch_local_irq_disable arch_local_irq_disable
 static inline void arch_local_irq_disable(void)
 {
 	asm volatile(
@@ -56,6 +59,7 @@ static inline void arch_local_irq_disable(void)
 /*
  * Save the current interrupt enable state & disable IRQs
  */
+#define arch_local_irq_save arch_local_irq_save
 static inline unsigned long arch_local_irq_save(void)
 {
 	unsigned long flags, temp;
@@ -73,6 +77,7 @@ static inline unsigned long arch_local_irq_save(void)
 /*
  * Enable IRQs
  */
+#define arch_local_irq_enable arch_local_irq_enable
 static inline void arch_local_irq_enable(void)
 {
 	unsigned long temp;
@@ -88,6 +93,7 @@ static inline void arch_local_irq_enable(void)
 /*
  * Disable IRQs
  */
+#define arch_local_irq_disable arch_local_irq_disable
 static inline void arch_local_irq_disable(void)
 {
 	unsigned long temp;
@@ -135,6 +141,7 @@ static inline void arch_local_irq_disable(void)
 /*
  * Save the current interrupt enable state.
  */
+#define arch_local_save_flags arch_local_save_flags
 static inline unsigned long arch_local_save_flags(void)
 {
 	unsigned long flags;
@@ -147,6 +154,7 @@ static inline unsigned long arch_local_save_flags(void)
 /*
  * restore saved IRQ & FIQ state
  */
+#define arch_local_irq_restore arch_local_irq_restore
 static inline void arch_local_irq_restore(unsigned long flags)
 {
 	asm volatile(
@@ -156,10 +164,13 @@ static inline void arch_local_irq_restore(unsigned long flags)
 		: "memory", "cc");
 }
 
+#define arch_irqs_disabled_flags arch_irqs_disabled_flags
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
 	return flags & IRQMASK_I_BIT;
 }
 
+#include <asm-generic/irqflags.h>
+
 #endif /* ifdef __KERNEL__ */
 #endif /* ifndef __ASM_ARM_IRQFLAGS_H */
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index 0406cb3f1af7..cb3a40717edd 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -51,7 +51,7 @@ struct machine_desc {
 	bool			(*smp_init)(void);
 	void			(*fixup)(struct tag *, char **);
 	void			(*dt_fixup)(void);
-	void			(*init_meminfo)(void);
+	long long		(*pv_fixup)(void);
 	void			(*reserve)(void);/* reserve mem blocks	*/
 	void			(*map_io)(void);/* IO mapping function	*/
 	void			(*init_early)(void);
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h
index 50b378f59e08..acd4983d9b1f 100644
--- a/arch/arm/include/asm/mcpm.h
+++ b/arch/arm/include/asm/mcpm.h
@@ -137,17 +137,12 @@ int mcpm_wait_for_cpu_powerdown(unsigned int cpu, unsigned int cluster);
 /**
  * mcpm_cpu_suspend - bring the calling CPU in a suspended state
  *
- * @expected_residency: duration in microseconds the CPU is expected
- *			to remain suspended, or 0 if unknown/infinity.
- *
- * The calling CPU is suspended.  The expected residency argument is used
- * as a hint by the platform specific backend to implement the appropriate
- * sleep state level according to the knowledge it has on wake-up latency
- * for the given hardware.
+ * The calling CPU is suspended.  This is similar to mcpm_cpu_power_down()
+ * except for possible extra platform specific configuration steps to allow
+ * an asynchronous wake-up e.g. with a pending interrupt.
  *
  * If this CPU is found to be the "last man standing" in the cluster
- * then the cluster may be prepared for power-down too, if the expected
- * residency makes it worthwhile.
+ * then the cluster may be prepared for power-down too.
  *
  * This must be called with interrupts disabled.
  *
@@ -157,7 +152,7 @@ int mcpm_wait_for_cpu_powerdown(unsigned int cpu, unsigned int cluster);
  * This will return if mcpm_platform_register() has not been called
  * previously in which case the caller should take appropriate action.
  */
-void mcpm_cpu_suspend(u64 expected_residency);
+void mcpm_cpu_suspend(void);
 
 /**
  * mcpm_cpu_powered_up - housekeeping workafter a CPU has been powered up
@@ -234,12 +229,6 @@ struct mcpm_platform_ops {
 	void (*cpu_is_up)(unsigned int cpu, unsigned int cluster);
 	void (*cluster_is_up)(unsigned int cluster);
 	int (*wait_for_powerdown)(unsigned int cpu, unsigned int cluster);
-
-	/* deprecated callbacks */
-	int (*power_up)(unsigned int cpu, unsigned int cluster);
-	void (*power_down)(void);
-	void (*suspend)(u64);
-	void (*powered_up)(void);
 };
 
 /**
@@ -251,35 +240,6 @@ struct mcpm_platform_ops {
  */
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops);
 
-/* Synchronisation structures for coordinating safe cluster setup/teardown: */
-
-/*
- * When modifying this structure, make sure you update the MCPM_SYNC_ defines
- * to match.
- */
-struct mcpm_sync_struct {
-	/* individual CPU states */
-	struct {
-		s8 cpu __aligned(__CACHE_WRITEBACK_GRANULE);
-	} cpus[MAX_CPUS_PER_CLUSTER];
-
-	/* cluster state */
-	s8 cluster __aligned(__CACHE_WRITEBACK_GRANULE);
-
-	/* inbound-side state */
-	s8 inbound __aligned(__CACHE_WRITEBACK_GRANULE);
-};
-
-struct sync_struct {
-	struct mcpm_sync_struct clusters[MAX_NR_CLUSTERS];
-};
-
-void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster);
-void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster);
-void __mcpm_outbound_leave_critical(unsigned int cluster, int state);
-bool __mcpm_outbound_enter_critical(unsigned int this_cpu, unsigned int cluster);
-int __mcpm_cluster_state(unsigned int cluster);
-
 /**
  * mcpm_sync_init - Initialize the cluster synchronization support
  *
@@ -318,6 +278,29 @@ int __init mcpm_loopback(void (*cache_disable)(void));
 
 void __init mcpm_smp_set_ops(void);
 
+/*
+ * Synchronisation structures for coordinating safe cluster setup/teardown.
+ * This is private to the MCPM core code and shared between C and assembly.
+ * When modifying this structure, make sure you update the MCPM_SYNC_ defines
+ * to match.
+ */
+struct mcpm_sync_struct {
+	/* individual CPU states */
+	struct {
+		s8 cpu __aligned(__CACHE_WRITEBACK_GRANULE);
+	} cpus[MAX_CPUS_PER_CLUSTER];
+
+	/* cluster state */
+	s8 cluster __aligned(__CACHE_WRITEBACK_GRANULE);
+
+	/* inbound-side state */
+	s8 inbound __aligned(__CACHE_WRITEBACK_GRANULE);
+};
+
+struct sync_struct {
+	struct mcpm_sync_struct clusters[MAX_NR_CLUSTERS];
+};
+
 #else
 
 /* 
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 184def0e1652..3a72d69b3255 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -18,8 +18,6 @@
 #include <linux/types.h>
 #include <linux/sizes.h>
 
-#include <asm/cache.h>
-
 #ifdef CONFIG_NEED_MACH_MEMORY_H
 #include <mach/memory.h>
 #endif
@@ -132,20 +130,6 @@
 #define page_to_phys(page)	(__pfn_to_phys(page_to_pfn(page)))
 #define phys_to_page(phys)	(pfn_to_page(__phys_to_pfn(phys)))
 
-/*
- * Minimum guaranted alignment in pgd_alloc().  The page table pointers passed
- * around in head.S and proc-*.S are shifted by this amount, in order to
- * leave spare high bits for systems with physical address extension.  This
- * does not fully accomodate the 40-bit addressing capability of ARM LPAE, but
- * gives us about 38-bits or so.
- */
-#ifdef CONFIG_ARM_LPAE
-#define ARCH_PGD_SHIFT		L1_CACHE_SHIFT
-#else
-#define ARCH_PGD_SHIFT		0
-#endif
-#define ARCH_PGD_MASK		((1 << ARCH_PGD_SHIFT) - 1)
-
 /*
  * PLAT_PHYS_OFFSET is the offset (from zero) of the start of physical
  * memory.  This is used for XIP and NoMMU kernels, and on platforms that don't
diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h
index ed690c49ef93..e358b7966c06 100644
--- a/arch/arm/include/asm/module.h
+++ b/arch/arm/include/asm/module.h
@@ -16,11 +16,21 @@ enum {
 	ARM_SEC_UNLIKELY,
 	ARM_SEC_MAX,
 };
+#endif
 
 struct mod_arch_specific {
+#ifdef CONFIG_ARM_UNWIND
 	struct unwind_table *unwind[ARM_SEC_MAX];
-};
 #endif
+#ifdef CONFIG_ARM_MODULE_PLTS
+	struct elf32_shdr   *core_plt;
+	struct elf32_shdr   *init_plt;
+	int		    core_plt_count;
+	int		    init_plt_count;
+#endif
+};
+
+u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val);
 
 /*
  * Add the ARM architecture version to the version magic string
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index d9cf138fd7d4..4f9dec489931 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -19,4 +19,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
 #endif
 
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+	(regs)->ARM_pc = (__ip); \
+	(regs)->ARM_fp = (unsigned long) __builtin_frame_address(0); \
+	(regs)->ARM_sp = current_stack_pointer; \
+	(regs)->ARM_cpsr = SVC_MODE; \
+}
+
 #endif /* __ARM_PERF_EVENT_H__ */
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index 675e4ab79f68..3fc87dfd77e6 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -24,22 +24,10 @@
  *	interrupt and passed the address of the low level handler,
  *	and can be used to implement any platform specific handling
  *	before or after calling it.
- * @runtime_resume: an optional handler which will be called by the
- *	runtime PM framework following a call to pm_runtime_get().
- *	Note that if pm_runtime_get() is called more than once in
- *	succession this handler will only be called once.
- * @runtime_suspend: an optional handler which will be called by the
- *	runtime PM framework following a call to pm_runtime_put().
- *	Note that if pm_runtime_get() is called more than once in
- *	succession this handler will only be called following the
- *	final call to pm_runtime_put() that actually disables the
- *	hardware.
  */
 struct arm_pmu_platdata {
 	irqreturn_t (*handle_irq)(int irq, void *dev,
 				  irq_handler_t pmu_handler);
-	int (*runtime_resume)(struct device *dev);
-	int (*runtime_suspend)(struct device *dev);
 };
 
 #ifdef CONFIG_HW_PERF_EVENTS
@@ -92,6 +80,7 @@ struct pmu_hw_events {
 struct arm_pmu {
 	struct pmu	pmu;
 	cpumask_t	active_irqs;
+	cpumask_t	supported_cpus;
 	int		*irq_affinity;
 	char		*name;
 	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
@@ -122,8 +111,6 @@ struct arm_pmu {
 
 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
 
-extern const struct dev_pm_ops armpmu_dev_pm_ops;
-
 int armpmu_register(struct arm_pmu *armpmu, int type);
 
 u64 armpmu_event_update(struct perf_event *event);
@@ -158,6 +145,10 @@ struct pmu_probe_info {
 #define XSCALE_PMU_PROBE(_version, _fn) \
 	PMU_PROBE(ARM_CPU_IMP_INTEL << 24 | _version, ARM_PMU_XSCALE_MASK, _fn)
 
+int arm_pmu_device_probe(struct platform_device *pdev,
+			 const struct of_device_id *of_table,
+			 const struct pmu_probe_info *probe_table);
+
 #endif /* CONFIG_HW_PERF_EVENTS */
 
 #endif /* __ARM_PMU_H__ */
diff --git a/arch/arm/include/asm/proc-fns.h b/arch/arm/include/asm/proc-fns.h
index 5324c1112f3a..8877ad5ffe10 100644
--- a/arch/arm/include/asm/proc-fns.h
+++ b/arch/arm/include/asm/proc-fns.h
@@ -125,13 +125,6 @@ extern void cpu_resume(void);
 		ttbr;						\
 	})
 
-#define cpu_set_ttbr(nr, val)					\
-	do {							\
-		u64 ttbr = val;					\
-		__asm__("mcrr	p15, " #nr ", %Q0, %R0, c2"	\
-			: : "r" (ttbr));			\
-	} while (0)
-
 #define cpu_get_pgd()	\
 	({						\
 		u64 pg = cpu_get_ttbr(0);		\
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index 18f5a554134f..2f3ac1ba6fb4 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -61,7 +61,7 @@ asmlinkage void secondary_start_kernel(void);
 struct secondary_data {
 	union {
 		unsigned long mpu_rgn_szr;
-		unsigned long pgdir;
+		u64 pgdir;
 	};
 	unsigned long swapper_pg_dir;
 	void *stack;
@@ -69,6 +69,7 @@ struct secondary_data {
 extern struct secondary_data secondary_data;
 extern volatile int pen_release;
 extern void secondary_startup(void);
+extern void secondary_startup_arm(void);
 
 extern int __cpu_disable(void);
 
diff --git a/arch/arm/include/asm/system_info.h b/arch/arm/include/asm/system_info.h
index 720ea0320a6d..3860cbd401ec 100644
--- a/arch/arm/include/asm/system_info.h
+++ b/arch/arm/include/asm/system_info.h
@@ -17,6 +17,7 @@
 
 /* information about the system we're running on */
 extern unsigned int system_rev;
+extern const char *system_serial;
 extern unsigned int system_serial_low;
 extern unsigned int system_serial_high;
 extern unsigned int mem_fclk_21285;
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h
index 200f9a7cd623..a91ae499614c 100644
--- a/arch/arm/include/asm/unified.h
+++ b/arch/arm/include/asm/unified.h
@@ -45,7 +45,6 @@
 #define THUMB(x...)	x
 #ifdef __ASSEMBLY__
 #define W(instr)	instr.w
-#define BSYM(sym)	sym + 1
 #else
 #define WASM(instr)	#instr ".w"
 #endif
@@ -59,7 +58,6 @@
 #define THUMB(x...)
 #ifdef __ASSEMBLY__
 #define W(instr)	instr
-#define BSYM(sym)	sym
 #else
 #define WASM(instr)	#instr
 #endif
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 752725dcbf42..e69f7a19735d 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_CPU_IDLE)		+= cpuidle.o
 obj-$(CONFIG_ISA_DMA_API)	+= dma.o
 obj-$(CONFIG_FIQ)		+= fiq.o fiqasm.o
 obj-$(CONFIG_MODULES)		+= armksyms.o module.o
+obj-$(CONFIG_ARM_MODULE_PLTS)	+= module-plts.o
 obj-$(CONFIG_ISA_DMA)		+= dma-isa.o
 obj-$(CONFIG_PCI)		+= bios32.o isa.o
 obj-$(CONFIG_ARM_CPU_SUSPEND)	+= sleep.o suspend.o
@@ -70,7 +71,9 @@ obj-$(CONFIG_CPU_PJ4)		+= pj4-cp0.o
 obj-$(CONFIG_CPU_PJ4B)		+= pj4-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_regs.o perf_callchain.o
-obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o perf_event_cpu.o
+obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o \
+				   perf_event_xscale.o perf_event_v6.o \
+				   perf_event_v7.o
 CFLAGS_pj4-cp0.o		:= -marm
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 obj-$(CONFIG_ARM_CPU_TOPOLOGY)  += topology.o
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 570306c49406..f8f7398c74c2 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -40,7 +40,7 @@
 #ifdef CONFIG_MULTI_IRQ_HANDLER
 	ldr	r1, =handle_arch_irq
 	mov	r0, sp
-	adr	lr, BSYM(9997f)
+	badr	lr, 9997f
 	ldr	pc, [r1]
 #else
 	arch_irq_handler_default
@@ -273,7 +273,7 @@ __und_svc:
 	str	r4, [sp, #S_PC]
 	orr	r0, r9, r0, lsl #16
 #endif
-	adr	r9, BSYM(__und_svc_finish)
+	badr	r9, __und_svc_finish
 	mov	r2, r4
 	bl	call_fpe
 
@@ -469,7 +469,7 @@ __und_usr:
 	@ instruction, or the more conventional lr if we are to treat
 	@ this as a real undefined instruction
 	@
-	adr	r9, BSYM(ret_from_exception)
+	badr	r9, ret_from_exception
 
 	@ IRQs must be enabled before attempting to read the instruction from
 	@ user space since that could cause a page/translation fault if the
@@ -486,7 +486,7 @@ __und_usr:
 	@ r2 = PC value for the following instruction (:= regs->ARM_pc)
 	@ r4 = PC value for the faulting instruction
 	@ lr = 32-bit undefined instruction function
-	adr	lr, BSYM(__und_usr_fault_32)
+	badr	lr, __und_usr_fault_32
 	b	call_fpe
 
 __und_usr_thumb:
@@ -522,7 +522,7 @@ ARM_BE8(rev16	r0, r0)				@ little endian instruction
 	add	r2, r2, #2			@ r2 is PC + 2, make it PC + 4
 	str	r2, [sp, #S_PC]			@ it's a 2x16bit instr, update
 	orr	r0, r0, r5, lsl #16
-	adr	lr, BSYM(__und_usr_fault_32)
+	badr	lr, __und_usr_fault_32
 	@ r0 = the two 16-bit Thumb instructions which caused the exception
 	@ r2 = PC value for the following Thumb instruction (:= regs->ARM_pc)
 	@ r4 = PC value for the first 16-bit Thumb instruction
@@ -716,7 +716,7 @@ __und_usr_fault_32:
 __und_usr_fault_16:
 	mov	r1, #2
 1:	mov	r0, sp
-	adr	lr, BSYM(ret_from_exception)
+	badr	lr, ret_from_exception
 	b	__und_fault
 ENDPROC(__und_usr_fault_32)
 ENDPROC(__und_usr_fault_16)
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 4e7f40c577e6..92828a1dec80 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -90,7 +90,7 @@ ENTRY(ret_from_fork)
 	bl	schedule_tail
 	cmp	r5, #0
 	movne	r0, r4
-	adrne	lr, BSYM(1f)
+	badrne	lr, 1f
 	retne	r5
 1:	get_thread_info tsk
 	b	ret_slow_syscall
@@ -198,7 +198,7 @@ local_restart:
 	bne	__sys_trace
 
 	cmp	scno, #NR_syscalls		@ check upper syscall limit
-	adr	lr, BSYM(ret_fast_syscall)	@ return address
+	badr	lr, ret_fast_syscall		@ return address
 	ldrcc	pc, [tbl, scno, lsl #2]		@ call sys_* routine
 
 	add	r1, sp, #S_OFF
@@ -233,7 +233,7 @@ __sys_trace:
 	add	r0, sp, #S_OFF
 	bl	syscall_trace_enter
 
-	adr	lr, BSYM(__sys_trace_return)	@ return address
+	badr	lr, __sys_trace_return		@ return address
 	mov	scno, r0			@ syscall number (possibly new)
 	add	r1, sp, #S_R0 + S_OFF		@ pointer to regs
 	cmp	scno, #NR_syscalls		@ check upper syscall limit
diff --git a/arch/arm/kernel/entry-ftrace.S b/arch/arm/kernel/entry-ftrace.S
index fe57c73e70a4..c73c4030ca5d 100644
--- a/arch/arm/kernel/entry-ftrace.S
+++ b/arch/arm/kernel/entry-ftrace.S
@@ -87,7 +87,7 @@
 
 1: 	mcount_get_lr	r1			@ lr of instrumented func
 	mcount_adjust_addr	r0, lr		@ instrumented function
-	adr	lr, BSYM(2f)
+	badr	lr, 2f
 	mov	pc, r2
 2:	mcount_exit
 .endm
diff --git a/arch/arm/kernel/entry-v7m.S b/arch/arm/kernel/entry-v7m.S
index 8944f4991c3c..b6c8bb9315e7 100644
--- a/arch/arm/kernel/entry-v7m.S
+++ b/arch/arm/kernel/entry-v7m.S
@@ -117,9 +117,14 @@ ENTRY(__switch_to)
 ENDPROC(__switch_to)
 
 	.data
-	.align	8
+#if CONFIG_CPU_V7M_NUM_IRQ <= 112
+	.align	9
+#else
+	.align	10
+#endif
+
 /*
- * Vector table (64 words => 256 bytes natural alignment)
+ * Vector table (Natural alignment need to be ensured)
  */
 ENTRY(vector_table)
 	.long	0			@ 0 - Reset stack pointer
@@ -138,6 +143,6 @@ ENTRY(vector_table)
 	.long	__invalid_entry		@ 13 - Reserved
 	.long	__pendsv_entry		@ 14 - PendSV
 	.long	__invalid_entry		@ 15 - SysTick
-	.rept	64 - 16
-	.long	__irq_entry		@ 16..64 - External Interrupts
+	.rept	CONFIG_CPU_V7M_NUM_IRQ
+	.long	__irq_entry		@ External Interrupts
 	.endr
diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S
index aebfbf79a1a3..9b8c5a113434 100644
--- a/arch/arm/kernel/head-nommu.S
+++ b/arch/arm/kernel/head-nommu.S
@@ -46,7 +46,7 @@ ENTRY(stext)
 	.arm
 ENTRY(stext)
 
- THUMB(	adr	r9, BSYM(1f)	)	@ Kernel is always entered in ARM.
+ THUMB(	badr	r9, 1f		)	@ Kernel is always entered in ARM.
  THUMB(	bx	r9		)	@ If this is a Thumb-2 kernel,
  THUMB(	.thumb			)	@ switch to Thumb now.
  THUMB(1:			)
@@ -77,13 +77,13 @@ ENTRY(stext)
 	orr	r6, r6, #(1 << MPU_RSR_EN)	@ Set region enabled bit
 	bl	__setup_mpu
 #endif
-	ldr	r13, =__mmap_switched		@ address to jump to after
-						@ initialising sctlr
-	adr	lr, BSYM(1f)			@ return (PIC) address
+
+	badr	lr, 1f				@ return (PIC) address
 	ldr	r12, [r10, #PROCINFO_INITFUNC]
 	add	r12, r12, r10
 	ret	r12
- 1:	b	__after_proc_init
+1:	bl	__after_proc_init
+	b	__mmap_switched
 ENDPROC(stext)
 
 #ifdef CONFIG_SMP
@@ -106,8 +106,7 @@ ENTRY(secondary_startup)
 	movs	r10, r5				@ invalid processor?
 	beq	__error_p			@ yes, error 'p'
 
-	adr	r4, __secondary_data
-	ldmia	r4, {r7, r12}
+	ldr	r7, __secondary_data
 
 #ifdef CONFIG_ARM_MPU
 	/* Use MPU region info supplied by __cpu_up */
@@ -115,23 +114,19 @@ ENTRY(secondary_startup)
 	bl      __setup_mpu			@ Initialize the MPU
 #endif
 
-	adr	lr, BSYM(__after_proc_init)	@ return address
-	mov	r13, r12			@ __secondary_switched address
+	badr	lr, 1f				@ return (PIC) address
 	ldr	r12, [r10, #PROCINFO_INITFUNC]
 	add	r12, r12, r10
 	ret	r12
-ENDPROC(secondary_startup)
-
-ENTRY(__secondary_switched)
-	ldr	sp, [r7, #8]			@ set up the stack pointer
+1:	bl	__after_proc_init
+	ldr	sp, [r7, #12]			@ set up the stack pointer
 	mov	fp, #0
 	b	secondary_start_kernel
-ENDPROC(__secondary_switched)
+ENDPROC(secondary_startup)
 
 	.type	__secondary_data, %object
 __secondary_data:
 	.long	secondary_data
-	.long	__secondary_switched
 #endif /* CONFIG_SMP */
 
 /*
@@ -164,7 +159,7 @@ __after_proc_init:
 #endif
 	mcr	p15, 0, r0, c1, c0, 0		@ write control reg
 #endif /* CONFIG_CPU_CP15 */
-	ret	r13
+	ret	lr
 ENDPROC(__after_proc_init)
 	.ltorg
 
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 3637973a9708..bd755d97e459 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -80,7 +80,7 @@
 ENTRY(stext)
  ARM_BE8(setend	be )			@ ensure we are in BE8 mode
 
- THUMB(	adr	r9, BSYM(1f)	)	@ Kernel is always entered in ARM.
+ THUMB(	badr	r9, 1f		)	@ Kernel is always entered in ARM.
  THUMB(	bx	r9		)	@ If this is a Thumb-2 kernel,
  THUMB(	.thumb			)	@ switch to Thumb now.
  THUMB(1:			)
@@ -131,13 +131,30 @@ ENTRY(stext)
 	 * The following calls CPU specific code in a position independent
 	 * manner.  See arch/arm/mm/proc-*.S for details.  r10 = base of
 	 * xxx_proc_info structure selected by __lookup_processor_type
-	 * above.  On return, the CPU will be ready for the MMU to be
-	 * turned on, and r0 will hold the CPU control register value.
+	 * above.
+	 *
+	 * The processor init function will be called with:
+	 *  r1 - machine type
+	 *  r2 - boot data (atags/dt) pointer
+	 *  r4 - translation table base (low word)
+	 *  r5 - translation table base (high word, if LPAE)
+	 *  r8 - translation table base 1 (pfn if LPAE)
+	 *  r9 - cpuid
+	 *  r13 - virtual address for __enable_mmu -> __turn_mmu_on
+	 *
+	 * On return, the CPU will be ready for the MMU to be turned on,
+	 * r0 will hold the CPU control register value, r1, r2, r4, and
+	 * r9 will be preserved.  r5 will also be preserved if LPAE.
 	 */
 	ldr	r13, =__mmap_switched		@ address to jump to after
 						@ mmu has been enabled
-	adr	lr, BSYM(1f)			@ return (PIC) address
+	badr	lr, 1f				@ return (PIC) address
+#ifdef CONFIG_ARM_LPAE
+	mov	r5, #0				@ high TTBR0
+	mov	r8, r4, lsr #12			@ TTBR1 is swapper_pg_dir pfn
+#else
 	mov	r8, r4				@ set TTBR1 to swapper_pg_dir
+#endif
 	ldr	r12, [r10, #PROCINFO_INITFUNC]
 	add	r12, r12, r10
 	ret	r12
@@ -158,7 +175,7 @@ ENDPROC(stext)
  *
  * Returns:
  *  r0, r3, r5-r7 corrupted
- *  r4 = page table (see ARCH_PGD_SHIFT in asm/memory.h)
+ *  r4 = physical page table address
  */
 __create_page_tables:
 	pgtbl	r4, r8				@ page table address
@@ -333,7 +350,6 @@ __create_page_tables:
 #endif
 #ifdef CONFIG_ARM_LPAE
 	sub	r4, r4, #0x1000		@ point to the PGD table
-	mov	r4, r4, lsr #ARCH_PGD_SHIFT
 #endif
 	ret	lr
 ENDPROC(__create_page_tables)
@@ -346,9 +362,9 @@ __turn_mmu_on_loc:
 
 #if defined(CONFIG_SMP)
 	.text
-ENTRY(secondary_startup_arm)
 	.arm
- THUMB(	adr	r9, BSYM(1f)	)	@ Kernel is entered in ARM.
+ENTRY(secondary_startup_arm)
+ THUMB(	badr	r9, 1f		)	@ Kernel is entered in ARM.
  THUMB(	bx	r9		)	@ If this is a Thumb-2 kernel,
  THUMB(	.thumb			)	@ switch to Thumb now.
  THUMB(1:			)
@@ -381,10 +397,10 @@ ENTRY(secondary_startup)
 	adr	r4, __secondary_data
 	ldmia	r4, {r5, r7, r12}		@ address to jump to after
 	sub	lr, r4, r5			@ mmu has been enabled
-	ldr	r4, [r7, lr]			@ get secondary_data.pgdir
-	add	r7, r7, #4
-	ldr	r8, [r7, lr]			@ get secondary_data.swapper_pg_dir
-	adr	lr, BSYM(__enable_mmu)		@ return address
+	add	r3, r7, lr
+	ldrd	r4, [r3, #0]			@ get secondary_data.pgdir
+	ldr	r8, [r3, #8]			@ get secondary_data.swapper_pg_dir
+	badr	lr, __enable_mmu		@ return address
 	mov	r13, r12			@ __secondary_switched address
 	ldr	r12, [r10, #PROCINFO_INITFUNC]
 	add	r12, r12, r10			@ initialise processor
@@ -397,7 +413,7 @@ ENDPROC(secondary_startup_arm)
 	 * r6  = &secondary_data
 	 */
 ENTRY(__secondary_switched)
-	ldr	sp, [r7, #4]			@ get secondary_data.stack
+	ldr	sp, [r7, #12]			@ get secondary_data.stack
 	mov	fp, #0
 	b	secondary_start_kernel
 ENDPROC(__secondary_switched)
@@ -416,12 +432,14 @@ __secondary_data:
 /*
  * Setup common bits before finally enabling the MMU.  Essentially
  * this is just loading the page table pointer and domain access
- * registers.
+ * registers.  All these registers need to be preserved by the
+ * processor setup function (or set in the case of r0)
  *
  *  r0  = cp#15 control register
  *  r1  = machine ID
  *  r2  = atags or dtb pointer
- *  r4  = page table (see ARCH_PGD_SHIFT in asm/memory.h)
+ *  r4  = TTBR pointer (low word)
+ *  r5  = TTBR pointer (high word if LPAE)
  *  r9  = processor ID
  *  r13 = *virtual* address to jump to upon completion
  */
@@ -440,7 +458,9 @@ __enable_mmu:
 #ifdef CONFIG_CPU_ICACHE_DISABLE
 	bic	r0, r0, #CR_I
 #endif
-#ifndef CONFIG_ARM_LPAE
+#ifdef CONFIG_ARM_LPAE
+	mcrr	p15, 0, r4, r5, c2		@ load TTBR0
+#else
 	mov	r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
 		      domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
 		      domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c
new file mode 100644
index 000000000000..097e2e201b9f
--- /dev/null
+++ b/arch/arm/kernel/module-plts.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/elf.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cache.h>
+#include <asm/opcodes.h>
+
+#define PLT_ENT_STRIDE		L1_CACHE_BYTES
+#define PLT_ENT_COUNT		(PLT_ENT_STRIDE / sizeof(u32))
+#define PLT_ENT_SIZE		(sizeof(struct plt_entries) / PLT_ENT_COUNT)
+
+#ifdef CONFIG_THUMB2_KERNEL
+#define PLT_ENT_LDR		__opcode_to_mem_thumb32(0xf8dff000 | \
+							(PLT_ENT_STRIDE - 4))
+#else
+#define PLT_ENT_LDR		__opcode_to_mem_arm(0xe59ff000 | \
+						    (PLT_ENT_STRIDE - 8))
+#endif
+
+struct plt_entries {
+	u32	ldr[PLT_ENT_COUNT];
+	u32	lit[PLT_ENT_COUNT];
+};
+
+static bool in_init(const struct module *mod, u32 addr)
+{
+	return addr - (u32)mod->module_init < mod->init_size;
+}
+
+u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
+{
+	struct plt_entries *plt, *plt_end;
+	int c, *count;
+
+	if (in_init(mod, loc)) {
+		plt = (void *)mod->arch.init_plt->sh_addr;
+		plt_end = (void *)plt + mod->arch.init_plt->sh_size;
+		count = &mod->arch.init_plt_count;
+	} else {
+		plt = (void *)mod->arch.core_plt->sh_addr;
+		plt_end = (void *)plt + mod->arch.core_plt->sh_size;
+		count = &mod->arch.core_plt_count;
+	}
+
+	/* Look for an existing entry pointing to 'val' */
+	for (c = *count; plt < plt_end; c -= PLT_ENT_COUNT, plt++) {
+		int i;
+
+		if (!c) {
+			/* Populate a new set of entries */
+			*plt = (struct plt_entries){
+				{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
+				{ val, }
+			};
+			++*count;
+			return (u32)plt->ldr;
+		}
+		for (i = 0; i < PLT_ENT_COUNT; i++) {
+			if (!plt->lit[i]) {
+				plt->lit[i] = val;
+				++*count;
+			}
+			if (plt->lit[i] == val)
+				return (u32)&plt->ldr[i];
+		}
+	}
+	BUG();
+}
+
+static int duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num,
+			   u32 mask)
+{
+	u32 *loc1, *loc2;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		if (rel[i].r_info != rel[num].r_info)
+			continue;
+
+		/*
+		 * Identical relocation types against identical symbols can
+		 * still result in different PLT entries if the addend in the
+		 * place is different. So resolve the target of the relocation
+		 * to compare the values.
+		 */
+		loc1 = (u32 *)(base + rel[i].r_offset);
+		loc2 = (u32 *)(base + rel[num].r_offset);
+		if (((*loc1 ^ *loc2) & mask) == 0)
+			return 1;
+	}
+	return 0;
+}
+
+/* Count how many PLT entries we may need */
+static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
+{
+	unsigned int ret = 0;
+	int i;
+
+	/*
+	 * Sure, this is order(n^2), but it's usually short, and not
+	 * time critical
+	 */
+	for (i = 0; i < num; i++)
+		switch (ELF32_R_TYPE(rel[i].r_info)) {
+		case R_ARM_CALL:
+		case R_ARM_PC24:
+		case R_ARM_JUMP24:
+			if (!duplicate_rel(base, rel, i,
+					   __opcode_to_mem_arm(0x00ffffff)))
+				ret++;
+			break;
+#ifdef CONFIG_THUMB2_KERNEL
+		case R_ARM_THM_CALL:
+		case R_ARM_THM_JUMP24:
+			if (!duplicate_rel(base, rel, i,
+					   __opcode_to_mem_thumb32(0x07ff2fff)))
+				ret++;
+#endif
+		}
+	return ret;
+}
+
+int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+			      char *secstrings, struct module *mod)
+{
+	unsigned long core_plts = 0, init_plts = 0;
+	Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
+
+	/*
+	 * To store the PLTs, we expand the .text section for core module code
+	 * and the .init.text section for initialization code.
+	 */
+	for (s = sechdrs; s < sechdrs_end; ++s)
+		if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
+			mod->arch.core_plt = s;
+		else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
+			mod->arch.init_plt = s;
+
+	if (!mod->arch.core_plt || !mod->arch.init_plt) {
+		pr_err("%s: sections missing\n", mod->name);
+		return -ENOEXEC;
+	}
+
+	for (s = sechdrs + 1; s < sechdrs_end; ++s) {
+		const Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
+		int numrels = s->sh_size / sizeof(Elf32_Rel);
+		Elf32_Shdr *dstsec = sechdrs + s->sh_info;
+
+		if (s->sh_type != SHT_REL)
+			continue;
+
+		if (strstr(secstrings + s->sh_name, ".init"))
+			init_plts += count_plts(dstsec->sh_addr, rels, numrels);
+		else
+			core_plts += count_plts(dstsec->sh_addr, rels, numrels);
+	}
+
+	mod->arch.core_plt->sh_type = SHT_NOBITS;
+	mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+	mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENT_SIZE,
+					       sizeof(struct plt_entries));
+	mod->arch.core_plt_count = 0;
+
+	mod->arch.init_plt->sh_type = SHT_NOBITS;
+	mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+	mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENT_SIZE,
+					       sizeof(struct plt_entries));
+	mod->arch.init_plt_count = 0;
+	pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__,
+		 mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size);
+	return 0;
+}
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index af791f4a6205..efdddcb97dd1 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -40,7 +40,12 @@
 #ifdef CONFIG_MMU
 void *module_alloc(unsigned long size)
 {
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+	void *p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
+		return p;
+	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
 				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
@@ -110,6 +115,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 				offset -= 0x04000000;
 
 			offset += sym->st_value - loc;
+
+			/*
+			 * Route through a PLT entry if 'offset' exceeds the
+			 * supported range. Note that 'offset + loc + 8'
+			 * contains the absolute jump target, i.e.,
+			 * @sym + addend, corrected for the +8 PC bias.
+			 */
+			if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS) &&
+			    (offset <= (s32)0xfe000000 ||
+			     offset >= (s32)0x02000000))
+				offset = get_module_plt(module, loc,
+							offset + loc + 8)
+					 - loc - 8;
+
 			if (offset <= (s32)0xfe000000 ||
 			    offset >= (s32)0x02000000) {
 				pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n",
@@ -203,6 +222,17 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 				offset -= 0x02000000;
 			offset += sym->st_value - loc;
 
+			/*
+			 * Route through a PLT entry if 'offset' exceeds the
+			 * supported range.
+			 */
+			if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS) &&
+			    (offset <= (s32)0xff000000 ||
+			     offset >= (s32)0x01000000))
+				offset = get_module_plt(module, loc,
+							offset + loc + 4)
+					 - loc - 4;
+
 			if (offset <= (s32)0xff000000 ||
 			    offset >= (s32)0x01000000) {
 				pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n",
diff --git a/arch/arm/kernel/module.lds b/arch/arm/kernel/module.lds
new file mode 100644
index 000000000000..3682fa107918
--- /dev/null
+++ b/arch/arm/kernel/module.lds
@@ -0,0 +1,4 @@
+SECTIONS {
+        .core.plt : { BYTE(0) }
+        .init.plt : { BYTE(0) }
+}
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 4a86a0133ac3..357f57ea83f4 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -11,12 +11,18 @@
  */
 #define pr_fmt(fmt) "hw perfevents: " fmt
 
+#include <linux/bitmap.h>
+#include <linux/cpumask.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/irq.h>
 #include <linux/irqdesc.h>
 
+#include <asm/cputype.h>
 #include <asm/irq_regs.h>
 #include <asm/pmu.h>
 
@@ -229,6 +235,10 @@ armpmu_add(struct perf_event *event, int flags)
 	int idx;
 	int err = 0;
 
+	/* An event following a process won't be stopped earlier */
+	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
+		return -ENOENT;
+
 	perf_pmu_disable(event->pmu);
 
 	/* If we don't have a space for the counter then finish early. */
@@ -344,20 +354,12 @@ static void
 armpmu_release_hardware(struct arm_pmu *armpmu)
 {
 	armpmu->free_irq(armpmu);
-	pm_runtime_put_sync(&armpmu->plat_device->dev);
 }
 
 static int
 armpmu_reserve_hardware(struct arm_pmu *armpmu)
 {
-	int err;
-	struct platform_device *pmu_device = armpmu->plat_device;
-
-	if (!pmu_device)
-		return -ENODEV;
-
-	pm_runtime_get_sync(&pmu_device->dev);
-	err = armpmu->request_irq(armpmu, armpmu_dispatch_irq);
+	int err = armpmu->request_irq(armpmu, armpmu_dispatch_irq);
 	if (err) {
 		armpmu_release_hardware(armpmu);
 		return err;
@@ -454,6 +456,17 @@ static int armpmu_event_init(struct perf_event *event)
 	int err = 0;
 	atomic_t *active_events = &armpmu->active_events;
 
+	/*
+	 * Reject CPU-affine events for CPUs that are of a different class to
+	 * that which this PMU handles. Process-following events (where
+	 * event->cpu == -1) can be migrated between CPUs, and thus we have to
+	 * reject them later (in armpmu_add) if they're scheduled on a
+	 * different class of CPU.
+	 */
+	if (event->cpu != -1 &&
+		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
+		return -ENOENT;
+
 	/* does not support taken branch sampling */
 	if (has_branch_stack(event))
 		return -EOPNOTSUPP;
@@ -489,6 +502,10 @@ static void armpmu_enable(struct pmu *pmu)
 	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
 	int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events);
 
+	/* For task-bound events we may be called on other CPUs */
+	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
+		return;
+
 	if (enabled)
 		armpmu->start(armpmu);
 }
@@ -496,35 +513,26 @@ static void armpmu_enable(struct pmu *pmu)
 static void armpmu_disable(struct pmu *pmu)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(pmu);
+
+	/* For task-bound events we may be called on other CPUs */
+	if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
+		return;
+
 	armpmu->stop(armpmu);
 }
 
-#ifdef CONFIG_PM
-static int armpmu_runtime_resume(struct device *dev)
+/*
+ * In heterogeneous systems, events are specific to a particular
+ * microarchitecture, and aren't suitable for another. Thus, only match CPUs of
+ * the same microarchitecture.
+ */
+static int armpmu_filter_match(struct perf_event *event)
 {
-	struct arm_pmu_platdata *plat = dev_get_platdata(dev);
-
-	if (plat && plat->runtime_resume)
-		return plat->runtime_resume(dev);
-
-	return 0;
+	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	unsigned int cpu = smp_processor_id();
+	return cpumask_test_cpu(cpu, &armpmu->supported_cpus);
 }
 
-static int armpmu_runtime_suspend(struct device *dev)
-{
-	struct arm_pmu_platdata *plat = dev_get_platdata(dev);
-
-	if (plat && plat->runtime_suspend)
-		return plat->runtime_suspend(dev);
-
-	return 0;
-}
-#endif
-
-const struct dev_pm_ops armpmu_dev_pm_ops = {
-	SET_RUNTIME_PM_OPS(armpmu_runtime_suspend, armpmu_runtime_resume, NULL)
-};
-
 static void armpmu_init(struct arm_pmu *armpmu)
 {
 	atomic_set(&armpmu->active_events, 0);
@@ -539,15 +547,349 @@ static void armpmu_init(struct arm_pmu *armpmu)
 		.start		= armpmu_start,
 		.stop		= armpmu_stop,
 		.read		= armpmu_read,
+		.filter_match	= armpmu_filter_match,
 	};
 }
 
 int armpmu_register(struct arm_pmu *armpmu, int type)
 {
 	armpmu_init(armpmu);
-	pm_runtime_enable(&armpmu->plat_device->dev);
 	pr_info("enabled with %s PMU driver, %d counters available\n",
 			armpmu->name, armpmu->num_events);
 	return perf_pmu_register(&armpmu->pmu, armpmu->name, type);
 }
 
+/* Set at runtime when we know what CPU type we are. */
+static struct arm_pmu *__oprofile_cpu_pmu;
+
+/*
+ * Despite the names, these two functions are CPU-specific and are used
+ * by the OProfile/perf code.
+ */
+const char *perf_pmu_name(void)
+{
+	if (!__oprofile_cpu_pmu)
+		return NULL;
+
+	return __oprofile_cpu_pmu->name;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_name);
+
+int perf_num_counters(void)
+{
+	int max_events = 0;
+
+	if (__oprofile_cpu_pmu != NULL)
+		max_events = __oprofile_cpu_pmu->num_events;
+
+	return max_events;
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
+static void cpu_pmu_enable_percpu_irq(void *data)
+{
+	int irq = *(int *)data;
+
+	enable_percpu_irq(irq, IRQ_TYPE_NONE);
+}
+
+static void cpu_pmu_disable_percpu_irq(void *data)
+{
+	int irq = *(int *)data;
+
+	disable_percpu_irq(irq);
+}
+
+static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
+{
+	int i, irq, irqs;
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
+
+	irqs = min(pmu_device->num_resources, num_possible_cpus());
+
+	irq = platform_get_irq(pmu_device, 0);
+	if (irq >= 0 && irq_is_percpu(irq)) {
+		on_each_cpu(cpu_pmu_disable_percpu_irq, &irq, 1);
+		free_percpu_irq(irq, &hw_events->percpu_pmu);
+	} else {
+		for (i = 0; i < irqs; ++i) {
+			int cpu = i;
+
+			if (cpu_pmu->irq_affinity)
+				cpu = cpu_pmu->irq_affinity[i];
+
+			if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs))
+				continue;
+			irq = platform_get_irq(pmu_device, i);
+			if (irq >= 0)
+				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu));
+		}
+	}
+}
+
+static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
+{
+	int i, err, irq, irqs;
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
+
+	if (!pmu_device)
+		return -ENODEV;
+
+	irqs = min(pmu_device->num_resources, num_possible_cpus());
+	if (irqs < 1) {
+		pr_warn_once("perf/ARM: No irqs for PMU defined, sampling events not supported\n");
+		return 0;
+	}
+
+	irq = platform_get_irq(pmu_device, 0);
+	if (irq >= 0 && irq_is_percpu(irq)) {
+		err = request_percpu_irq(irq, handler, "arm-pmu",
+					 &hw_events->percpu_pmu);
+		if (err) {
+			pr_err("unable to request IRQ%d for ARM PMU counters\n",
+				irq);
+			return err;
+		}
+		on_each_cpu(cpu_pmu_enable_percpu_irq, &irq, 1);
+	} else {
+		for (i = 0; i < irqs; ++i) {
+			int cpu = i;
+
+			err = 0;
+			irq = platform_get_irq(pmu_device, i);
+			if (irq < 0)
+				continue;
+
+			if (cpu_pmu->irq_affinity)
+				cpu = cpu_pmu->irq_affinity[i];
+
+			/*
+			 * If we have a single PMU interrupt that we can't shift,
+			 * assume that we're running on a uniprocessor machine and
+			 * continue. Otherwise, continue without this interrupt.
+			 */
+			if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) {
+				pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n",
+					irq, cpu);
+				continue;
+			}
+
+			err = request_irq(irq, handler,
+					  IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu",
+					  per_cpu_ptr(&hw_events->percpu_pmu, cpu));
+			if (err) {
+				pr_err("unable to request IRQ%d for ARM PMU counters\n",
+					irq);
+				return err;
+			}
+
+			cpumask_set_cpu(cpu, &cpu_pmu->active_irqs);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * PMU hardware loses all context when a CPU goes offline.
+ * When a CPU is hotplugged back in, since some hardware registers are
+ * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
+ * junk values out of them.
+ */
+static int cpu_pmu_notify(struct notifier_block *b, unsigned long action,
+			  void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct arm_pmu *pmu = container_of(b, struct arm_pmu, hotplug_nb);
+
+	if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
+		return NOTIFY_DONE;
+
+	if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
+		return NOTIFY_DONE;
+
+	if (pmu->reset)
+		pmu->reset(pmu);
+	else
+		return NOTIFY_DONE;
+
+	return NOTIFY_OK;
+}
+
+static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	int err;
+	int cpu;
+	struct pmu_hw_events __percpu *cpu_hw_events;
+
+	cpu_hw_events = alloc_percpu(struct pmu_hw_events);
+	if (!cpu_hw_events)
+		return -ENOMEM;
+
+	cpu_pmu->hotplug_nb.notifier_call = cpu_pmu_notify;
+	err = register_cpu_notifier(&cpu_pmu->hotplug_nb);
+	if (err)
+		goto out_hw_events;
+
+	for_each_possible_cpu(cpu) {
+		struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu);
+		raw_spin_lock_init(&events->pmu_lock);
+		events->percpu_pmu = cpu_pmu;
+	}
+
+	cpu_pmu->hw_events	= cpu_hw_events;
+	cpu_pmu->request_irq	= cpu_pmu_request_irq;
+	cpu_pmu->free_irq	= cpu_pmu_free_irq;
+
+	/* Ensure the PMU has sane values out of reset. */
+	if (cpu_pmu->reset)
+		on_each_cpu_mask(&cpu_pmu->supported_cpus, cpu_pmu->reset,
+			 cpu_pmu, 1);
+
+	/* If no interrupts available, set the corresponding capability flag */
+	if (!platform_get_irq(cpu_pmu->plat_device, 0))
+		cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+	return 0;
+
+out_hw_events:
+	free_percpu(cpu_hw_events);
+	return err;
+}
+
+static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
+{
+	unregister_cpu_notifier(&cpu_pmu->hotplug_nb);
+	free_percpu(cpu_pmu->hw_events);
+}
+
+/*
+ * CPU PMU identification and probing.
+ */
+static int probe_current_pmu(struct arm_pmu *pmu,
+			     const struct pmu_probe_info *info)
+{
+	int cpu = get_cpu();
+	unsigned int cpuid = read_cpuid_id();
+	int ret = -ENODEV;
+
+	pr_info("probing PMU on CPU %d\n", cpu);
+
+	for (; info->init != NULL; info++) {
+		if ((cpuid & info->mask) != info->cpuid)
+			continue;
+		ret = info->init(pmu);
+		break;
+	}
+
+	put_cpu();
+	return ret;
+}
+
+static int of_pmu_irq_cfg(struct arm_pmu *pmu)
+{
+	int i, irq, *irqs;
+	struct platform_device *pdev = pmu->plat_device;
+
+	/* Don't bother with PPIs; they're already affine */
+	irq = platform_get_irq(pdev, 0);
+	if (irq >= 0 && irq_is_percpu(irq))
+		return 0;
+
+	irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL);
+	if (!irqs)
+		return -ENOMEM;
+
+	for (i = 0; i < pdev->num_resources; ++i) {
+		struct device_node *dn;
+		int cpu;
+
+		dn = of_parse_phandle(pdev->dev.of_node, "interrupt-affinity",
+				      i);
+		if (!dn) {
+			pr_warn("Failed to parse %s/interrupt-affinity[%d]\n",
+				of_node_full_name(pdev->dev.of_node), i);
+			break;
+		}
+
+		for_each_possible_cpu(cpu)
+			if (arch_find_n_match_cpu_physical_id(dn, cpu, NULL))
+				break;
+
+		of_node_put(dn);
+		if (cpu >= nr_cpu_ids) {
+			pr_warn("Failed to find logical CPU for %s\n",
+				dn->name);
+			break;
+		}
+
+		irqs[i] = cpu;
+		cpumask_set_cpu(cpu, &pmu->supported_cpus);
+	}
+
+	if (i == pdev->num_resources) {
+		pmu->irq_affinity = irqs;
+	} else {
+		kfree(irqs);
+		cpumask_setall(&pmu->supported_cpus);
+	}
+
+	return 0;
+}
+
+int arm_pmu_device_probe(struct platform_device *pdev,
+			 const struct of_device_id *of_table,
+			 const struct pmu_probe_info *probe_table)
+{
+	const struct of_device_id *of_id;
+	const int (*init_fn)(struct arm_pmu *);
+	struct device_node *node = pdev->dev.of_node;
+	struct arm_pmu *pmu;
+	int ret = -ENODEV;
+
+	pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
+	if (!pmu) {
+		pr_info("failed to allocate PMU device!\n");
+		return -ENOMEM;
+	}
+
+	if (!__oprofile_cpu_pmu)
+		__oprofile_cpu_pmu = pmu;
+
+	pmu->plat_device = pdev;
+
+	if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) {
+		init_fn = of_id->data;
+
+		ret = of_pmu_irq_cfg(pmu);
+		if (!ret)
+			ret = init_fn(pmu);
+	} else {
+		ret = probe_current_pmu(pmu, probe_table);
+		cpumask_setall(&pmu->supported_cpus);
+	}
+
+	if (ret) {
+		pr_info("failed to probe PMU!\n");
+		goto out_free;
+	}
+
+	ret = cpu_pmu_init(pmu);
+	if (ret)
+		goto out_free;
+
+	ret = armpmu_register(pmu, -1);
+	if (ret)
+		goto out_destroy;
+
+	return 0;
+
+out_destroy:
+	cpu_pmu_destroy(pmu);
+out_free:
+	pr_info("failed to register PMU devices!\n");
+	kfree(pmu);
+	return ret;
+}
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
deleted file mode 100644
index 3b8c2833c537..000000000000
--- a/arch/arm/kernel/perf_event_cpu.c
+++ /dev/null
@@ -1,421 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) 2012 ARM Limited
- *
- * Author: Will Deacon <will.deacon@arm.com>
- */
-#define pr_fmt(fmt) "CPU PMU: " fmt
-
-#include <linux/bitmap.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/irq.h>
-#include <linux/irqdesc.h>
-
-#include <asm/cputype.h>
-#include <asm/irq_regs.h>
-#include <asm/pmu.h>
-
-/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *cpu_pmu;
-
-/*
- * Despite the names, these two functions are CPU-specific and are used
- * by the OProfile/perf code.
- */
-const char *perf_pmu_name(void)
-{
-	if (!cpu_pmu)
-		return NULL;
-
-	return cpu_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-	int max_events = 0;
-
-	if (cpu_pmu != NULL)
-		max_events = cpu_pmu->num_events;
-
-	return max_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
-/* Include the PMU-specific implementations. */
-#include "perf_event_xscale.c"
-#include "perf_event_v6.c"
-#include "perf_event_v7.c"
-
-static void cpu_pmu_enable_percpu_irq(void *data)
-{
-	int irq = *(int *)data;
-
-	enable_percpu_irq(irq, IRQ_TYPE_NONE);
-}
-
-static void cpu_pmu_disable_percpu_irq(void *data)
-{
-	int irq = *(int *)data;
-
-	disable_percpu_irq(irq);
-}
-
-static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
-{
-	int i, irq, irqs;
-	struct platform_device *pmu_device = cpu_pmu->plat_device;
-	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
-
-	irqs = min(pmu_device->num_resources, num_possible_cpus());
-
-	irq = platform_get_irq(pmu_device, 0);
-	if (irq >= 0 && irq_is_percpu(irq)) {
-		on_each_cpu(cpu_pmu_disable_percpu_irq, &irq, 1);
-		free_percpu_irq(irq, &hw_events->percpu_pmu);
-	} else {
-		for (i = 0; i < irqs; ++i) {
-			int cpu = i;
-
-			if (cpu_pmu->irq_affinity)
-				cpu = cpu_pmu->irq_affinity[i];
-
-			if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs))
-				continue;
-			irq = platform_get_irq(pmu_device, i);
-			if (irq >= 0)
-				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu));
-		}
-	}
-}
-
-static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
-{
-	int i, err, irq, irqs;
-	struct platform_device *pmu_device = cpu_pmu->plat_device;
-	struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events;
-
-	if (!pmu_device)
-		return -ENODEV;
-
-	irqs = min(pmu_device->num_resources, num_possible_cpus());
-	if (irqs < 1) {
-		pr_warn_once("perf/ARM: No irqs for PMU defined, sampling events not supported\n");
-		return 0;
-	}
-
-	irq = platform_get_irq(pmu_device, 0);
-	if (irq >= 0 && irq_is_percpu(irq)) {
-		err = request_percpu_irq(irq, handler, "arm-pmu",
-					 &hw_events->percpu_pmu);
-		if (err) {
-			pr_err("unable to request IRQ%d for ARM PMU counters\n",
-				irq);
-			return err;
-		}
-		on_each_cpu(cpu_pmu_enable_percpu_irq, &irq, 1);
-	} else {
-		for (i = 0; i < irqs; ++i) {
-			int cpu = i;
-
-			err = 0;
-			irq = platform_get_irq(pmu_device, i);
-			if (irq < 0)
-				continue;
-
-			if (cpu_pmu->irq_affinity)
-				cpu = cpu_pmu->irq_affinity[i];
-
-			/*
-			 * If we have a single PMU interrupt that we can't shift,
-			 * assume that we're running on a uniprocessor machine and
-			 * continue. Otherwise, continue without this interrupt.
-			 */
-			if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) {
-				pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n",
-					irq, cpu);
-				continue;
-			}
-
-			err = request_irq(irq, handler,
-					  IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu",
-					  per_cpu_ptr(&hw_events->percpu_pmu, cpu));
-			if (err) {
-				pr_err("unable to request IRQ%d for ARM PMU counters\n",
-					irq);
-				return err;
-			}
-
-			cpumask_set_cpu(cpu, &cpu_pmu->active_irqs);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * PMU hardware loses all context when a CPU goes offline.
- * When a CPU is hotplugged back in, since some hardware registers are
- * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
- * junk values out of them.
- */
-static int cpu_pmu_notify(struct notifier_block *b, unsigned long action,
-			  void *hcpu)
-{
-	struct arm_pmu *pmu = container_of(b, struct arm_pmu, hotplug_nb);
-
-	if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
-		return NOTIFY_DONE;
-
-	if (pmu->reset)
-		pmu->reset(pmu);
-	else
-		return NOTIFY_DONE;
-
-	return NOTIFY_OK;
-}
-
-static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	int err;
-	int cpu;
-	struct pmu_hw_events __percpu *cpu_hw_events;
-
-	cpu_hw_events = alloc_percpu(struct pmu_hw_events);
-	if (!cpu_hw_events)
-		return -ENOMEM;
-
-	cpu_pmu->hotplug_nb.notifier_call = cpu_pmu_notify;
-	err = register_cpu_notifier(&cpu_pmu->hotplug_nb);
-	if (err)
-		goto out_hw_events;
-
-	for_each_possible_cpu(cpu) {
-		struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu);
-		raw_spin_lock_init(&events->pmu_lock);
-		events->percpu_pmu = cpu_pmu;
-	}
-
-	cpu_pmu->hw_events	= cpu_hw_events;
-	cpu_pmu->request_irq	= cpu_pmu_request_irq;
-	cpu_pmu->free_irq	= cpu_pmu_free_irq;
-
-	/* Ensure the PMU has sane values out of reset. */
-	if (cpu_pmu->reset)
-		on_each_cpu(cpu_pmu->reset, cpu_pmu, 1);
-
-	/* If no interrupts available, set the corresponding capability flag */
-	if (!platform_get_irq(cpu_pmu->plat_device, 0))
-		cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
-
-	return 0;
-
-out_hw_events:
-	free_percpu(cpu_hw_events);
-	return err;
-}
-
-static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
-{
-	unregister_cpu_notifier(&cpu_pmu->hotplug_nb);
-	free_percpu(cpu_pmu->hw_events);
-}
-
-/*
- * PMU platform driver and devicetree bindings.
- */
-static const struct of_device_id cpu_pmu_of_device_ids[] = {
-	{.compatible = "arm,cortex-a17-pmu",	.data = armv7_a17_pmu_init},
-	{.compatible = "arm,cortex-a15-pmu",	.data = armv7_a15_pmu_init},
-	{.compatible = "arm,cortex-a12-pmu",	.data = armv7_a12_pmu_init},
-	{.compatible = "arm,cortex-a9-pmu",	.data = armv7_a9_pmu_init},
-	{.compatible = "arm,cortex-a8-pmu",	.data = armv7_a8_pmu_init},
-	{.compatible = "arm,cortex-a7-pmu",	.data = armv7_a7_pmu_init},
-	{.compatible = "arm,cortex-a5-pmu",	.data = armv7_a5_pmu_init},
-	{.compatible = "arm,arm11mpcore-pmu",	.data = armv6mpcore_pmu_init},
-	{.compatible = "arm,arm1176-pmu",	.data = armv6_1176_pmu_init},
-	{.compatible = "arm,arm1136-pmu",	.data = armv6_1136_pmu_init},
-	{.compatible = "qcom,krait-pmu",	.data = krait_pmu_init},
-	{.compatible = "qcom,scorpion-pmu",	.data = scorpion_pmu_init},
-	{.compatible = "qcom,scorpion-mp-pmu",	.data = scorpion_mp_pmu_init},
-	{},
-};
-
-static struct platform_device_id cpu_pmu_plat_device_ids[] = {
-	{.name = "arm-pmu"},
-	{.name = "armv6-pmu"},
-	{.name = "armv7-pmu"},
-	{.name = "xscale-pmu"},
-	{},
-};
-
-static const struct pmu_probe_info pmu_probe_table[] = {
-	ARM_PMU_PROBE(ARM_CPU_PART_ARM1136, armv6_1136_pmu_init),
-	ARM_PMU_PROBE(ARM_CPU_PART_ARM1156, armv6_1156_pmu_init),
-	ARM_PMU_PROBE(ARM_CPU_PART_ARM1176, armv6_1176_pmu_init),
-	ARM_PMU_PROBE(ARM_CPU_PART_ARM11MPCORE, armv6mpcore_pmu_init),
-	ARM_PMU_PROBE(ARM_CPU_PART_CORTEX_A8, armv7_a8_pmu_init),
-	ARM_PMU_PROBE(ARM_CPU_PART_CORTEX_A9, armv7_a9_pmu_init),
-	XSCALE_PMU_PROBE(ARM_CPU_XSCALE_ARCH_V1, xscale1pmu_init),
-	XSCALE_PMU_PROBE(ARM_CPU_XSCALE_ARCH_V2, xscale2pmu_init),
-	{ /* sentinel value */ }
-};
-
-/*
- * CPU PMU identification and probing.
- */
-static int probe_current_pmu(struct arm_pmu *pmu)
-{
-	int cpu = get_cpu();
-	unsigned int cpuid = read_cpuid_id();
-	int ret = -ENODEV;
-	const struct pmu_probe_info *info;
-
-	pr_info("probing PMU on CPU %d\n", cpu);
-
-	for (info = pmu_probe_table; info->init != NULL; info++) {
-		if ((cpuid & info->mask) != info->cpuid)
-			continue;
-		ret = info->init(pmu);
-		break;
-	}
-
-	put_cpu();
-	return ret;
-}
-
-static int of_pmu_irq_cfg(struct platform_device *pdev)
-{
-	int i, irq;
-	int *irqs;
-
-	/* Don't bother with PPIs; they're already affine */
-	irq = platform_get_irq(pdev, 0);
-	if (irq >= 0 && irq_is_percpu(irq))
-		return 0;
-
-	irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL);
-	if (!irqs)
-		return -ENOMEM;
-
-	for (i = 0; i < pdev->num_resources; ++i) {
-		struct device_node *dn;
-		int cpu;
-
-		dn = of_parse_phandle(pdev->dev.of_node, "interrupt-affinity",
-				      i);
-		if (!dn) {
-			pr_warn("Failed to parse %s/interrupt-affinity[%d]\n",
-				of_node_full_name(pdev->dev.of_node), i);
-			break;
-		}
-
-		for_each_possible_cpu(cpu)
-			if (arch_find_n_match_cpu_physical_id(dn, cpu, NULL))
-				break;
-
-		of_node_put(dn);
-		if (cpu >= nr_cpu_ids) {
-			pr_warn("Failed to find logical CPU for %s\n",
-				dn->name);
-			break;
-		}
-
-		irqs[i] = cpu;
-	}
-
-	if (i == pdev->num_resources)
-		cpu_pmu->irq_affinity = irqs;
-	else
-		kfree(irqs);
-
-	return 0;
-}
-
-static int cpu_pmu_device_probe(struct platform_device *pdev)
-{
-	const struct of_device_id *of_id;
-	const int (*init_fn)(struct arm_pmu *);
-	struct device_node *node = pdev->dev.of_node;
-	struct arm_pmu *pmu;
-	int ret = -ENODEV;
-
-	if (cpu_pmu) {
-		pr_info("attempt to register multiple PMU devices!\n");
-		return -ENOSPC;
-	}
-
-	pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
-	if (!pmu) {
-		pr_info("failed to allocate PMU device!\n");
-		return -ENOMEM;
-	}
-
-	cpu_pmu = pmu;
-	cpu_pmu->plat_device = pdev;
-
-	if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) {
-		init_fn = of_id->data;
-
-		ret = of_pmu_irq_cfg(pdev);
-		if (!ret)
-			ret = init_fn(pmu);
-	} else {
-		ret = probe_current_pmu(pmu);
-	}
-
-	if (ret) {
-		pr_info("failed to probe PMU!\n");
-		goto out_free;
-	}
-
-	ret = cpu_pmu_init(cpu_pmu);
-	if (ret)
-		goto out_free;
-
-	ret = armpmu_register(cpu_pmu, -1);
-	if (ret)
-		goto out_destroy;
-
-	return 0;
-
-out_destroy:
-	cpu_pmu_destroy(cpu_pmu);
-out_free:
-	pr_info("failed to register PMU devices!\n");
-	kfree(pmu);
-	return ret;
-}
-
-static struct platform_driver cpu_pmu_driver = {
-	.driver		= {
-		.name	= "arm-pmu",
-		.pm	= &armpmu_dev_pm_ops,
-		.of_match_table = cpu_pmu_of_device_ids,
-	},
-	.probe		= cpu_pmu_device_probe,
-	.id_table	= cpu_pmu_plat_device_ids,
-};
-
-static int __init register_pmu_driver(void)
-{
-	return platform_driver_register(&cpu_pmu_driver);
-}
-device_initcall(register_pmu_driver);
diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index f2ffd5c542ed..09f83e414a72 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -31,6 +31,14 @@
  */
 
 #if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K)
+
+#include <asm/cputype.h>
+#include <asm/irq_regs.h>
+#include <asm/pmu.h>
+
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
 enum armv6_perf_types {
 	ARMV6_PERFCTR_ICACHE_MISS	    = 0x0,
 	ARMV6_PERFCTR_IBUF_STALL	    = 0x1,
@@ -543,24 +551,39 @@ static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
 
 	return 0;
 }
-#else
-static int armv6_1136_pmu_init(struct arm_pmu *cpu_pmu)
+
+static struct of_device_id armv6_pmu_of_device_ids[] = {
+	{.compatible = "arm,arm11mpcore-pmu",	.data = armv6mpcore_pmu_init},
+	{.compatible = "arm,arm1176-pmu",	.data = armv6_1176_pmu_init},
+	{.compatible = "arm,arm1136-pmu",	.data = armv6_1136_pmu_init},
+	{ /* sentinel value */ }
+};
+
+static const struct pmu_probe_info armv6_pmu_probe_table[] = {
+	ARM_PMU_PROBE(ARM_CPU_PART_ARM1136, armv6_1136_pmu_init),
+	ARM_PMU_PROBE(ARM_CPU_PART_ARM1156, armv6_1156_pmu_init),
+	ARM_PMU_PROBE(ARM_CPU_PART_ARM1176, armv6_1176_pmu_init),
+	ARM_PMU_PROBE(ARM_CPU_PART_ARM11MPCORE, armv6mpcore_pmu_init),
+	{ /* sentinel value */ }
+};
+
+static int armv6_pmu_device_probe(struct platform_device *pdev)
 {
-	return -ENODEV;
+	return arm_pmu_device_probe(pdev, armv6_pmu_of_device_ids,
+				    armv6_pmu_probe_table);
 }
 
-static int armv6_1156_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
+static struct platform_driver armv6_pmu_driver = {
+	.driver		= {
+		.name	= "armv6-pmu",
+		.of_match_table = armv6_pmu_of_device_ids,
+	},
+	.probe		= armv6_pmu_device_probe,
+};
 
-static int armv6_1176_pmu_init(struct arm_pmu *cpu_pmu)
+static int __init register_armv6_pmu_driver(void)
 {
-	return -ENODEV;
-}
-
-static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
+	return platform_driver_register(&armv6_pmu_driver);
 }
+device_initcall(register_armv6_pmu_driver);
 #endif	/* CONFIG_CPU_V6 || CONFIG_CPU_V6K */
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index f4207a4dcb01..f9b37f876e20 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -19,9 +19,15 @@
 #ifdef CONFIG_CPU_V7
 
 #include <asm/cp15.h>
+#include <asm/cputype.h>
+#include <asm/irq_regs.h>
+#include <asm/pmu.h>
 #include <asm/vfp.h>
 #include "../vfp/vfpinstr.h"
 
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
 /*
  * Common ARMv7 event types
  *
@@ -1056,15 +1062,22 @@ static void armv7pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->max_period	= (1LLU << 32) - 1;
 };
 
-static u32 armv7_read_num_pmnc_events(void)
+static void armv7_read_num_pmnc_events(void *info)
 {
-	u32 nb_cnt;
+	int *nb_cnt = info;
 
 	/* Read the nb of CNTx counters supported from PMNC */
-	nb_cnt = (armv7_pmnc_read() >> ARMV7_PMNC_N_SHIFT) & ARMV7_PMNC_N_MASK;
+	*nb_cnt = (armv7_pmnc_read() >> ARMV7_PMNC_N_SHIFT) & ARMV7_PMNC_N_MASK;
 
-	/* Add the CPU cycles counter and return */
-	return nb_cnt + 1;
+	/* Add the CPU cycles counter */
+	*nb_cnt += 1;
+}
+
+static int armv7_probe_num_events(struct arm_pmu *arm_pmu)
+{
+	return smp_call_function_any(&arm_pmu->supported_cpus,
+				     armv7_read_num_pmnc_events,
+				     &arm_pmu->num_events, 1);
 }
 
 static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1072,8 +1085,7 @@ static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_cortex_a8";
 	cpu_pmu->map_event	= armv7_a8_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1081,8 +1093,7 @@ static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_cortex_a9";
 	cpu_pmu->map_event	= armv7_a9_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1090,8 +1101,7 @@ static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_cortex_a5";
 	cpu_pmu->map_event	= armv7_a5_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1099,9 +1109,8 @@ static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_cortex_a15";
 	cpu_pmu->map_event	= armv7_a15_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1109,9 +1118,8 @@ static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_cortex_a7";
 	cpu_pmu->map_event	= armv7_a7_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1119,16 +1127,15 @@ static int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_cortex_a12";
 	cpu_pmu->map_event	= armv7_a12_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int armv7_a17_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	armv7_a12_pmu_init(cpu_pmu);
+	int ret = armv7_a12_pmu_init(cpu_pmu);
 	cpu_pmu->name = "armv7_cortex_a17";
-	return 0;
+	return ret;
 }
 
 /*
@@ -1508,14 +1515,13 @@ static int krait_pmu_init(struct arm_pmu *cpu_pmu)
 		cpu_pmu->map_event = krait_map_event_no_branch;
 	else
 		cpu_pmu->map_event = krait_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
 	cpu_pmu->reset		= krait_pmu_reset;
 	cpu_pmu->enable		= krait_pmu_enable_event;
 	cpu_pmu->disable	= krait_pmu_disable_event;
 	cpu_pmu->get_event_idx	= krait_pmu_get_event_idx;
 	cpu_pmu->clear_event_idx = krait_pmu_clear_event_idx;
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 /*
@@ -1833,13 +1839,12 @@ static int scorpion_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_scorpion";
 	cpu_pmu->map_event	= scorpion_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->reset		= scorpion_pmu_reset;
 	cpu_pmu->enable		= scorpion_pmu_enable_event;
 	cpu_pmu->disable	= scorpion_pmu_disable_event;
 	cpu_pmu->get_event_idx	= scorpion_pmu_get_event_idx;
 	cpu_pmu->clear_event_idx = scorpion_pmu_clear_event_idx;
-	return 0;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
 static int scorpion_mp_pmu_init(struct arm_pmu *cpu_pmu)
@@ -1847,62 +1852,52 @@ static int scorpion_mp_pmu_init(struct arm_pmu *cpu_pmu)
 	armv7pmu_init(cpu_pmu);
 	cpu_pmu->name		= "armv7_scorpion_mp";
 	cpu_pmu->map_event	= scorpion_map_event;
-	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->reset		= scorpion_pmu_reset;
 	cpu_pmu->enable		= scorpion_pmu_enable_event;
 	cpu_pmu->disable	= scorpion_pmu_disable_event;
 	cpu_pmu->get_event_idx	= scorpion_pmu_get_event_idx;
 	cpu_pmu->clear_event_idx = scorpion_pmu_clear_event_idx;
-	return 0;
-}
-#else
-static inline int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
+	return armv7_probe_num_events(cpu_pmu);
 }
 
-static inline int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
+static const struct of_device_id armv7_pmu_of_device_ids[] = {
+	{.compatible = "arm,cortex-a17-pmu",	.data = armv7_a17_pmu_init},
+	{.compatible = "arm,cortex-a15-pmu",	.data = armv7_a15_pmu_init},
+	{.compatible = "arm,cortex-a12-pmu",	.data = armv7_a12_pmu_init},
+	{.compatible = "arm,cortex-a9-pmu",	.data = armv7_a9_pmu_init},
+	{.compatible = "arm,cortex-a8-pmu",	.data = armv7_a8_pmu_init},
+	{.compatible = "arm,cortex-a7-pmu",	.data = armv7_a7_pmu_init},
+	{.compatible = "arm,cortex-a5-pmu",	.data = armv7_a5_pmu_init},
+	{.compatible = "qcom,krait-pmu",	.data = krait_pmu_init},
+	{.compatible = "qcom,scorpion-pmu",	.data = scorpion_pmu_init},
+	{.compatible = "qcom,scorpion-mp-pmu",	.data = scorpion_mp_pmu_init},
+	{},
+};
+
+static const struct pmu_probe_info armv7_pmu_probe_table[] = {
+	ARM_PMU_PROBE(ARM_CPU_PART_CORTEX_A8, armv7_a8_pmu_init),
+	ARM_PMU_PROBE(ARM_CPU_PART_CORTEX_A9, armv7_a9_pmu_init),
+	{ /* sentinel value */ }
+};
+
+
+static int armv7_pmu_device_probe(struct platform_device *pdev)
 {
-	return -ENODEV;
+	return arm_pmu_device_probe(pdev, armv7_pmu_of_device_ids,
+				    armv7_pmu_probe_table);
 }
 
-static inline int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
+static struct platform_driver armv7_pmu_driver = {
+	.driver		= {
+		.name	= "armv7-pmu",
+		.of_match_table = armv7_pmu_of_device_ids,
+	},
+	.probe		= armv7_pmu_device_probe,
+};
 
-static inline int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
+static int __init register_armv7_pmu_driver(void)
 {
-	return -ENODEV;
-}
-
-static inline int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
-
-static inline int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
-
-static inline int armv7_a17_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
-
-static inline int krait_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
-
-static inline int scorpion_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
-}
-
-static inline int scorpion_mp_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return -ENODEV;
+	return platform_driver_register(&armv7_pmu_driver);
 }
+device_initcall(register_armv7_pmu_driver);
 #endif	/* CONFIG_CPU_V7 */
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 8af9f1f82c68..304d056d5b25 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -13,6 +13,14 @@
  */
 
 #ifdef CONFIG_CPU_XSCALE
+
+#include <asm/cputype.h>
+#include <asm/irq_regs.h>
+#include <asm/pmu.h>
+
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
 enum xscale_perf_types {
 	XSCALE_PERFCTR_ICACHE_MISS		= 0x00,
 	XSCALE_PERFCTR_ICACHE_NO_DELIVER	= 0x01,
@@ -740,14 +748,28 @@ static int xscale2pmu_init(struct arm_pmu *cpu_pmu)
 
 	return 0;
 }
-#else
-static inline int xscale1pmu_init(struct arm_pmu *cpu_pmu)
+
+static const struct pmu_probe_info xscale_pmu_probe_table[] = {
+	XSCALE_PMU_PROBE(ARM_CPU_XSCALE_ARCH_V1, xscale1pmu_init),
+	XSCALE_PMU_PROBE(ARM_CPU_XSCALE_ARCH_V2, xscale2pmu_init),
+	{ /* sentinel value */ }
+};
+
+static int xscale_pmu_device_probe(struct platform_device *pdev)
 {
-	return -ENODEV;
+	return arm_pmu_device_probe(pdev, NULL, xscale_pmu_probe_table);
 }
 
-static inline int xscale2pmu_init(struct arm_pmu *cpu_pmu)
+static struct platform_driver xscale_pmu_driver = {
+	.driver		= {
+		.name	= "xscale-pmu",
+	},
+	.probe		= xscale_pmu_device_probe,
+};
+
+static int __init register_xscale_pmu_driver(void)
 {
-	return -ENODEV;
+	return platform_driver_register(&xscale_pmu_driver);
 }
+device_initcall(register_xscale_pmu_driver);
 #endif	/* CONFIG_CPU_XSCALE */
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 6c777e908a24..e6d8c7658ffd 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -75,8 +75,7 @@ __setup("fpe=", fpe_setup);
 
 extern void init_default_cache_policy(unsigned long);
 extern void paging_init(const struct machine_desc *desc);
-extern void early_paging_init(const struct machine_desc *,
-			      struct proc_info_list *);
+extern void early_paging_init(const struct machine_desc *);
 extern void sanity_check_meminfo(void);
 extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(const struct machine_desc *desc);
@@ -93,6 +92,9 @@ unsigned int __atags_pointer __initdata;
 unsigned int system_rev;
 EXPORT_SYMBOL(system_rev);
 
+const char *system_serial;
+EXPORT_SYMBOL(system_serial);
+
 unsigned int system_serial_low;
 EXPORT_SYMBOL(system_serial_low);
 
@@ -839,8 +841,25 @@ arch_initcall(customize_machine);
 
 static int __init init_machine_late(void)
 {
+	struct device_node *root;
+	int ret;
+
 	if (machine_desc->init_late)
 		machine_desc->init_late();
+
+	root = of_find_node_by_path("/");
+	if (root) {
+		ret = of_property_read_string(root, "serial-number",
+					      &system_serial);
+		if (ret)
+			system_serial = NULL;
+	}
+
+	if (!system_serial)
+		system_serial = kasprintf(GFP_KERNEL, "%08x%08x",
+					  system_serial_high,
+					  system_serial_low);
+
 	return 0;
 }
 late_initcall(init_machine_late);
@@ -936,7 +955,9 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
-	early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
+#ifdef CONFIG_MMU
+	early_paging_init(mdesc);
+#endif
 	setup_dma_zone(mdesc);
 	sanity_check_meminfo();
 	arm_memblock_init(mdesc);
@@ -1109,8 +1130,7 @@ static int c_show(struct seq_file *m, void *v)
 
 	seq_printf(m, "Hardware\t: %s\n", machine_name);
 	seq_printf(m, "Revision\t: %04x\n", system_rev);
-	seq_printf(m, "Serial\t\t: %08x%08x\n",
-		   system_serial_high, system_serial_low);
+	seq_printf(m, "Serial\t\t: %s\n", system_serial);
 
 	return 0;
 }
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
index 6060dbc7844e..0f6c1000582c 100644
--- a/arch/arm/kernel/sleep.S
+++ b/arch/arm/kernel/sleep.S
@@ -81,7 +81,7 @@ ENTRY(__cpu_suspend)
 	mov	r1, r4			@ size of save block
 	add	r0, sp, #8		@ pointer to save block
 	bl	__cpu_suspend_save
-	adr	lr, BSYM(cpu_suspend_abort)
+	badr	lr, cpu_suspend_abort
 	ldmfd	sp!, {r0, pc}		@ call suspend fn
 ENDPROC(__cpu_suspend)
 	.ltorg
@@ -122,7 +122,7 @@ ENDPROC(cpu_resume_after_mmu)
 #ifdef CONFIG_MMU
 	.arm
 ENTRY(cpu_resume_arm)
- THUMB(	adr	r9, BSYM(1f)	)	@ Kernel is entered in ARM.
+ THUMB(	badr	r9, 1f		)	@ Kernel is entered in ARM.
  THUMB(	bx	r9		)	@ If this is a Thumb-2 kernel,
  THUMB(	.thumb			)	@ switch to Thumb now.
  THUMB(1:			)
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index cca5b8758185..90dfbedfbfb8 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -86,9 +86,11 @@ void __init smp_set_ops(struct smp_operations *ops)
 
 static unsigned long get_arch_pgd(pgd_t *pgd)
 {
-	phys_addr_t pgdir = virt_to_idmap(pgd);
-	BUG_ON(pgdir & ARCH_PGD_MASK);
-	return pgdir >> ARCH_PGD_SHIFT;
+#ifdef CONFIG_ARM_LPAE
+	return __phys_to_pfn(virt_to_phys(pgd));
+#else
+	return virt_to_phys(pgd);
+#endif
 }
 
 int __cpu_up(unsigned int cpu, struct task_struct *idle)
@@ -108,7 +110,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 #endif
 
 #ifdef CONFIG_MMU
-	secondary_data.pgdir = get_arch_pgd(idmap_pgd);
+	secondary_data.pgdir = virt_to_phys(idmap_pgd);
 	secondary_data.swapper_pg_dir = get_arch_pgd(swapper_pg_dir);
 #endif
 	sync_cache_w(&secondary_data);
diff --git a/arch/arm/kernel/tcm.c b/arch/arm/kernel/tcm.c
index 7a3be1d4d0b1..b10e1360762e 100644
--- a/arch/arm/kernel/tcm.c
+++ b/arch/arm/kernel/tcm.c
@@ -17,6 +17,9 @@
 #include <asm/mach/map.h>
 #include <asm/memory.h>
 #include <asm/system_info.h>
+#include <asm/traps.h>
+
+#define TCMTR_FORMAT_MASK	0xe0000000U
 
 static struct gen_pool *tcm_pool;
 static bool dtcm_present;
@@ -175,6 +178,77 @@ static int __init setup_tcm_bank(u8 type, u8 bank, u8 banks,
 	return 0;
 }
 
+/*
+ * When we are running in the non-secure world and the secure world
+ * has not explicitly given us access to the TCM we will get an
+ * undefined error when reading the TCM region register in the
+ * setup_tcm_bank function (above).
+ *
+ * There are two variants of this register read that we need to trap,
+ * the read for the data TCM and the read for the instruction TCM:
+ *  c0370628:       ee196f11        mrc     15, 0, r6, cr9, cr1, {0}
+ *  c0370674:       ee196f31        mrc     15, 0, r6, cr9, cr1, {1}
+ *
+ * Our undef hook mask explicitly matches all fields of the encoded
+ * instruction other than the destination register.  The mask also
+ * only allows operand 2 to have the values 0 or 1.
+ *
+ * The undefined hook is defined as __init and __initdata, and therefore
+ * must be removed before tcm_init returns.
+ *
+ * In this particular case (MRC with ARM condition code ALways) the
+ * Thumb-2 and ARM instruction encoding are identical, so this hook
+ * will work on a Thumb-2 kernel.
+ *
+ * See A8.8.107, DDI0406C_C ARM Architecture Reference Manual, Encoding
+ * T1/A1 for the bit-by-bit details.
+ *
+ *  mrc   p15, 0, XX, c9, c1, 0
+ *  mrc   p15, 0, XX, c9, c1, 1
+ *   |  |  |   |   |   |   |  +---- opc2           0|1 = 000|001
+ *   |  |  |   |   |   |   +------- CRm              0 = 0001
+ *   |  |  |   |   |   +----------- CRn              0 = 1001
+ *   |  |  |   |   +--------------- Rt               ? = ????
+ *   |  |  |   +------------------- opc1             0 =  000
+ *   |  |  +----------------------- coproc          15 = 1111
+ *   |  +-------------------------- condition   ALways = 1110
+ *   +----------------------------- instruction    MRC = 1110
+ *
+ * Encoding this as per A8.8.107 of DDI0406C, Encoding T1/A1, yields:
+ *  1111 1111 1111 1111 0000 1111 1101 1111 Required Mask
+ *  1110 1110 0001 1001 ???? 1111 0001 0001 mrc p15, 0, XX, c9, c1, 0
+ *  1110 1110 0001 1001 ???? 1111 0011 0001 mrc p15, 0, XX, c9, c1, 1
+ *  [  ] [  ] [ ]| [  ] [  ] [  ] [ ]| +--- CRm
+ *    |    |   | |   |    |    |   | +----- SBO
+ *    |    |   | |   |    |    |   +------- opc2
+ *    |    |   | |   |    |    +----------- coproc
+ *    |    |   | |   |    +---------------- Rt
+ *    |    |   | |   +--------------------- CRn
+ *    |    |   | +------------------------- SBO
+ *    |    |   +--------------------------- opc1
+ *    |    +------------------------------- instruction
+ *    +------------------------------------ condition
+ */
+#define TCM_REGION_READ_MASK		0xffff0fdf
+#define TCM_REGION_READ_INSTR		0xee190f11
+#define DEST_REG_SHIFT			12
+#define DEST_REG_MASK			0xf
+
+static int __init tcm_handler(struct pt_regs *regs, unsigned int instr)
+{
+	regs->uregs[(instr >> DEST_REG_SHIFT) & DEST_REG_MASK] = 0;
+	regs->ARM_pc += 4;
+	return 0;
+}
+
+static struct undef_hook tcm_hook __initdata = {
+	.instr_mask	= TCM_REGION_READ_MASK,
+	.instr_val	= TCM_REGION_READ_INSTR,
+	.cpsr_mask	= MODE_MASK,
+	.cpsr_val	= SVC_MODE,
+	.fn		= tcm_handler
+};
+
 /*
  * This initializes the TCM memory
  */
@@ -204,9 +278,18 @@ void __init tcm_init(void)
 	}
 
 	tcm_status = read_cpuid_tcmstatus();
+
+	/*
+	 * This code only supports v6-compatible TCMTR implementations.
+	 */
+	if (tcm_status & TCMTR_FORMAT_MASK)
+		return;
+
 	dtcm_banks = (tcm_status >> 16) & 0x03;
 	itcm_banks = (tcm_status & 0x03);
 
+	register_undef_hook(&tcm_hook);
+
 	/* Values greater than 2 for D/ITCM banks are "reserved" */
 	if (dtcm_banks > 2)
 		dtcm_banks = 0;
@@ -218,7 +301,7 @@ void __init tcm_init(void)
 		for (i = 0; i < dtcm_banks; i++) {
 			ret = setup_tcm_bank(0, i, dtcm_banks, &dtcm_end);
 			if (ret)
-				return;
+				goto unregister;
 		}
 		/* This means you compiled more code than fits into DTCM */
 		if (dtcm_code_sz > (dtcm_end - DTCM_OFFSET)) {
@@ -227,6 +310,12 @@ void __init tcm_init(void)
 				dtcm_code_sz, (dtcm_end - DTCM_OFFSET));
 			goto no_dtcm;
 		}
+		/*
+		 * This means that the DTCM sizes were 0 or the DTCM banks
+		 * were inaccessible due to TrustZone configuration.
+		 */
+		if (!(dtcm_end - DTCM_OFFSET))
+			goto no_dtcm;
 		dtcm_res.end = dtcm_end - 1;
 		request_resource(&iomem_resource, &dtcm_res);
 		dtcm_iomap[0].length = dtcm_end - DTCM_OFFSET;
@@ -250,15 +339,21 @@ no_dtcm:
 		for (i = 0; i < itcm_banks; i++) {
 			ret = setup_tcm_bank(1, i, itcm_banks, &itcm_end);
 			if (ret)
-				return;
+				goto unregister;
 		}
 		/* This means you compiled more code than fits into ITCM */
 		if (itcm_code_sz > (itcm_end - ITCM_OFFSET)) {
 			pr_info("CPU ITCM: %u bytes of code compiled to "
 				"ITCM but only %lu bytes of ITCM present\n",
 				itcm_code_sz, (itcm_end - ITCM_OFFSET));
-			return;
+			goto unregister;
 		}
+		/*
+		 * This means that the ITCM sizes were 0 or the ITCM banks
+		 * were inaccessible due to TrustZone configuration.
+		 */
+		if (!(itcm_end - ITCM_OFFSET))
+			goto unregister;
 		itcm_res.end = itcm_end - 1;
 		request_resource(&iomem_resource, &itcm_res);
 		itcm_iomap[0].length = itcm_end - ITCM_OFFSET;
@@ -275,6 +370,9 @@ no_dtcm:
 		pr_info("CPU ITCM: %u bytes of code compiled to ITCM but no "
 			"ITCM banks present in CPU\n", itcm_code_sz);
 	}
+
+unregister:
+	unregister_undef_hook(&tcm_hook);
 }
 
 /*
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 3dce1a342030..d358226236f2 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -749,14 +749,6 @@ late_initcall(arm_mrc_hook_init);
 
 #endif
 
-void __bad_xchg(volatile void *ptr, int size)
-{
-	pr_err("xchg: bad data size: pc 0x%p, ptr 0x%p, size %d\n",
-	       __builtin_return_address(0), ptr, size);
-	BUG();
-}
-EXPORT_SYMBOL(__bad_xchg);
-
 /*
  * A data abort trap was taken, but we did not handle the instruction.
  * Try to abort the user program, or panic if it was the kernel.
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index f7db3a5d80e3..568494dbbbb5 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -307,7 +307,7 @@ ENTRY(kvm_call_hyp)
 THUMB(	orr	r2, r2, #PSR_T_BIT	)
 	msr	spsr_cxsf, r2
 	mrs	r1, ELR_hyp
-	ldr	r2, =BSYM(panic)
+	ldr	r2, =panic
 	msr	ELR_hyp, r2
 	ldr	r0, =\panic_str
 	clrex				@ Clear exclusive monitor
diff --git a/arch/arm/lib/call_with_stack.S b/arch/arm/lib/call_with_stack.S
index ed1a421813cb..bf3a40889205 100644
--- a/arch/arm/lib/call_with_stack.S
+++ b/arch/arm/lib/call_with_stack.S
@@ -35,7 +35,7 @@ ENTRY(call_with_stack)
 	mov	r2, r0
 	mov	r0, r1
 
-	adr	lr, BSYM(1f)
+	badr	lr, 1f
 	ret	r2
 
 1:	ldr	lr, [sp]
diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c
index 96866d03d281..f572219c7a40 100644
--- a/arch/arm/mach-exynos/suspend.c
+++ b/arch/arm/mach-exynos/suspend.c
@@ -311,13 +311,7 @@ static int exynos5420_cpu_suspend(unsigned long arg)
 
 	if (IS_ENABLED(CONFIG_EXYNOS5420_MCPM)) {
 		mcpm_set_entry_vector(cpu, cluster, exynos_cpu_resume);
-
-		/*
-		 * Residency value passed to mcpm_cpu_suspend back-end
-		 * has to be given clear semantics. Set to 0 as a
-		 * temporary value.
-		 */
-		mcpm_cpu_suspend(0);
+		mcpm_cpu_suspend();
 	}
 
 	pr_info("Failed to suspend the system\n");
diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
index 280f3f14f77c..b5f8f5ffda79 100644
--- a/arch/arm/mach-hisi/platmcpm.c
+++ b/arch/arm/mach-hisi/platmcpm.c
@@ -6,6 +6,8 @@
  * under the terms and conditions of the GNU General Public License,
  * version 2, as published by the Free Software Foundation.
  */
+#include <linux/init.h>
+#include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
@@ -13,7 +15,9 @@
 
 #include <asm/cputype.h>
 #include <asm/cp15.h>
-#include <asm/mcpm.h>
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
+#include <asm/smp_plat.h>
 
 #include "core.h"
 
@@ -94,11 +98,16 @@ static void hip04_set_snoop_filter(unsigned int cluster, unsigned int on)
 	} while (data != readl_relaxed(fabric + FAB_SF_MODE));
 }
 
-static int hip04_mcpm_power_up(unsigned int cpu, unsigned int cluster)
+static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
 {
+	unsigned int mpidr, cpu, cluster;
 	unsigned long data;
 	void __iomem *sys_dreq, *sys_status;
 
+	mpidr = cpu_logical_map(l_cpu);
+	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+
 	if (!sysctrl)
 		return -ENODEV;
 	if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
@@ -118,6 +127,7 @@ static int hip04_mcpm_power_up(unsigned int cpu, unsigned int cluster)
 			cpu_relax();
 			data = readl_relaxed(sys_status);
 		} while (data & CLUSTER_DEBUG_RESET_STATUS);
+		hip04_set_snoop_filter(cluster, 1);
 	}
 
 	data = CORE_RESET_BIT(cpu) | NEON_RESET_BIT(cpu) | \
@@ -126,11 +136,15 @@ static int hip04_mcpm_power_up(unsigned int cpu, unsigned int cluster)
 	do {
 		cpu_relax();
 	} while (data == readl_relaxed(sys_status));
+
 	/*
 	 * We may fail to power up core again without this delay.
 	 * It's not mentioned in document. It's found by test.
 	 */
 	udelay(20);
+
+	arch_send_wakeup_ipi_mask(cpumask_of(l_cpu));
+
 out:
 	hip04_cpu_table[cluster][cpu]++;
 	spin_unlock_irq(&boot_lock);
@@ -138,31 +152,30 @@ out:
 	return 0;
 }
 
-static void hip04_mcpm_power_down(void)
+#ifdef CONFIG_HOTPLUG_CPU
+static void hip04_cpu_die(unsigned int l_cpu)
 {
 	unsigned int mpidr, cpu, cluster;
-	bool skip_wfi = false, last_man = false;
+	bool last_man;
 
-	mpidr = read_cpuid_mpidr();
+	mpidr = cpu_logical_map(l_cpu);
 	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
 
-	__mcpm_cpu_going_down(cpu, cluster);
-
 	spin_lock(&boot_lock);
-	BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
 	hip04_cpu_table[cluster][cpu]--;
 	if (hip04_cpu_table[cluster][cpu] == 1) {
 		/* A power_up request went ahead of us. */
-		skip_wfi = true;
+		spin_unlock(&boot_lock);
+		return;
 	} else if (hip04_cpu_table[cluster][cpu] > 1) {
 		pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
 		BUG();
 	}
 
 	last_man = hip04_cluster_is_down(cluster);
-	if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
-		spin_unlock(&boot_lock);
+	spin_unlock(&boot_lock);
+	if (last_man) {
 		/* Since it's Cortex A15, disable L2 prefetching. */
 		asm volatile(
 		"mcr	p15, 1, %0, c15, c0, 3 \n\t"
@@ -170,34 +183,30 @@ static void hip04_mcpm_power_down(void)
 		"dsb	"
 		: : "r" (0x400) );
 		v7_exit_coherency_flush(all);
-		hip04_set_snoop_filter(cluster, 0);
-		__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
 	} else {
-		spin_unlock(&boot_lock);
 		v7_exit_coherency_flush(louis);
 	}
 
-	__mcpm_cpu_down(cpu, cluster);
-
-	if (!skip_wfi)
+	for (;;)
 		wfi();
 }
 
-static int hip04_mcpm_wait_for_powerdown(unsigned int cpu, unsigned int cluster)
+static int hip04_cpu_kill(unsigned int l_cpu)
 {
+	unsigned int mpidr, cpu, cluster;
 	unsigned int data, tries, count;
-	int ret = -ETIMEDOUT;
 
+	mpidr = cpu_logical_map(l_cpu);
+	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
 	BUG_ON(cluster >= HIP04_MAX_CLUSTERS ||
 	       cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
 
 	count = TIMEOUT_MSEC / POLL_MSEC;
 	spin_lock_irq(&boot_lock);
 	for (tries = 0; tries < count; tries++) {
-		if (hip04_cpu_table[cluster][cpu]) {
-			ret = -EBUSY;
+		if (hip04_cpu_table[cluster][cpu])
 			goto err;
-		}
 		cpu_relax();
 		data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
 		if (data & CORE_WFI_STATUS(cpu))
@@ -220,64 +229,22 @@ static int hip04_mcpm_wait_for_powerdown(unsigned int cpu, unsigned int cluster)
 	}
 	if (tries >= count)
 		goto err;
+	if (hip04_cluster_is_down(cluster))
+		hip04_set_snoop_filter(cluster, 0);
 	spin_unlock_irq(&boot_lock);
-	return 0;
+	return 1;
 err:
 	spin_unlock_irq(&boot_lock);
-	return ret;
+	return 0;
 }
+#endif
 
-static void hip04_mcpm_powered_up(void)
-{
-	unsigned int mpidr, cpu, cluster;
-
-	mpidr = read_cpuid_mpidr();
-	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
-
-	spin_lock(&boot_lock);
-	if (!hip04_cpu_table[cluster][cpu])
-		hip04_cpu_table[cluster][cpu] = 1;
-	spin_unlock(&boot_lock);
-}
-
-static void __naked hip04_mcpm_power_up_setup(unsigned int affinity_level)
-{
-	asm volatile ("			\n"
-"	cmp	r0, #0			\n"
-"	bxeq	lr			\n"
-	/* calculate fabric phys address */
-"	adr	r2, 2f			\n"
-"	ldmia	r2, {r1, r3}		\n"
-"	sub	r0, r2, r1		\n"
-"	ldr	r2, [r0, r3]		\n"
-	/* get cluster id from MPIDR */
-"	mrc	p15, 0, r0, c0, c0, 5	\n"
-"	ubfx	r1, r0, #8, #8		\n"
-	/* 1 << cluster id */
-"	mov	r0, #1			\n"
-"	mov	r3, r0, lsl r1		\n"
-"	ldr	r0, [r2, #"__stringify(FAB_SF_MODE)"]	\n"
-"	tst	r0, r3			\n"
-"	bxne	lr			\n"
-"	orr	r1, r0, r3		\n"
-"	str	r1, [r2, #"__stringify(FAB_SF_MODE)"]	\n"
-"1:	ldr	r0, [r2, #"__stringify(FAB_SF_MODE)"]	\n"
-"	tst	r0, r3			\n"
-"	beq	1b			\n"
-"	bx	lr			\n"
-
-"	.align	2			\n"
-"2:	.word	.			\n"
-"	.word	fabric_phys_addr	\n"
-	);
-}
-
-static const struct mcpm_platform_ops hip04_mcpm_ops = {
-	.power_up		= hip04_mcpm_power_up,
-	.power_down		= hip04_mcpm_power_down,
-	.wait_for_powerdown	= hip04_mcpm_wait_for_powerdown,
-	.powered_up		= hip04_mcpm_powered_up,
+static struct smp_operations __initdata hip04_smp_ops = {
+	.smp_boot_secondary	= hip04_boot_secondary,
+#ifdef CONFIG_HOTPLUG_CPU
+	.cpu_die		= hip04_cpu_die,
+	.cpu_kill		= hip04_cpu_kill,
+#endif
 };
 
 static bool __init hip04_cpu_table_init(void)
@@ -298,7 +265,7 @@ static bool __init hip04_cpu_table_init(void)
 	return true;
 }
 
-static int __init hip04_mcpm_init(void)
+static int __init hip04_smp_init(void)
 {
 	struct device_node *np, *np_sctl, *np_fab;
 	struct resource fab_res;
@@ -353,10 +320,6 @@ static int __init hip04_mcpm_init(void)
 		ret = -EINVAL;
 		goto err_table;
 	}
-	ret = mcpm_platform_register(&hip04_mcpm_ops);
-	if (ret) {
-		goto err_table;
-	}
 
 	/*
 	 * Fill the instruction address that is used after secondary core
@@ -364,13 +327,11 @@ static int __init hip04_mcpm_init(void)
 	 */
 	writel_relaxed(hip04_boot_method[0], relocation);
 	writel_relaxed(0xa5a5a5a5, relocation + 4);	/* magic number */
-	writel_relaxed(virt_to_phys(mcpm_entry_point), relocation + 8);
+	writel_relaxed(virt_to_phys(secondary_startup), relocation + 8);
 	writel_relaxed(0, relocation + 12);
 	iounmap(relocation);
 
-	mcpm_sync_init(hip04_mcpm_power_up_setup);
-	mcpm_smp_set_ops();
-	pr_info("HiP04 MCPM initialized\n");
+	smp_set_ops(&hip04_smp_ops);
 	return ret;
 err_table:
 	iounmap(fabric);
@@ -383,4 +344,4 @@ err_reloc:
 err:
 	return ret;
 }
-early_initcall(hip04_mcpm_init);
+early_initcall(hip04_smp_init);
diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c
index 30003ba447a5..5b0e363fe5ba 100644
--- a/arch/arm/mach-integrator/integrator_ap.c
+++ b/arch/arm/mach-integrator/integrator_ap.c
@@ -37,7 +37,6 @@
 #include <linux/stat.h>
 #include <linux/termios.h>
 
-#include <asm/hardware/arm_timer.h>
 #include <asm/setup.h>
 #include <asm/param.h>		/* HZ */
 #include <asm/mach-types.h>
diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c
index 06620875813a..e288010522f9 100644
--- a/arch/arm/mach-keystone/keystone.c
+++ b/arch/arm/mach-keystone/keystone.c
@@ -27,7 +27,6 @@
 
 #include "keystone.h"
 
-static struct notifier_block platform_nb;
 static unsigned long keystone_dma_pfn_offset __read_mostly;
 
 static int keystone_platform_notifier(struct notifier_block *nb,
@@ -49,11 +48,18 @@ static int keystone_platform_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static struct notifier_block platform_nb = {
+	.notifier_call = keystone_platform_notifier,
+};
+
 static void __init keystone_init(void)
 {
-	keystone_pm_runtime_init();
-	if (platform_nb.notifier_call)
+	if (PHYS_OFFSET >= KEYSTONE_HIGH_PHYS_START) {
+		keystone_dma_pfn_offset = PFN_DOWN(KEYSTONE_HIGH_PHYS_START -
+						   KEYSTONE_LOW_PHYS_START);
 		bus_register_notifier(&platform_bus_type, &platform_nb);
+	}
+	keystone_pm_runtime_init();
 	of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL);
 }
 
@@ -62,11 +68,9 @@ static phys_addr_t keystone_virt_to_idmap(unsigned long x)
 	return (phys_addr_t)(x) - CONFIG_PAGE_OFFSET + KEYSTONE_LOW_PHYS_START;
 }
 
-static void __init keystone_init_meminfo(void)
+static long long __init keystone_pv_fixup(void)
 {
-	bool lpae = IS_ENABLED(CONFIG_ARM_LPAE);
-	bool pvpatch = IS_ENABLED(CONFIG_ARM_PATCH_PHYS_VIRT);
-	phys_addr_t offset = PHYS_OFFSET - KEYSTONE_LOW_PHYS_START;
+	long long offset;
 	phys_addr_t mem_start, mem_end;
 
 	mem_start = memblock_start_of_DRAM();
@@ -75,32 +79,21 @@ static void __init keystone_init_meminfo(void)
 	/* nothing to do if we are running out of the <32-bit space */
 	if (mem_start >= KEYSTONE_LOW_PHYS_START &&
 	    mem_end   <= KEYSTONE_LOW_PHYS_END)
-		return;
-
-	if (!lpae || !pvpatch) {
-		pr_crit("Enable %s%s%s to run outside 32-bit space\n",
-		      !lpae ? __stringify(CONFIG_ARM_LPAE) : "",
-		      (!lpae && !pvpatch) ? " and " : "",
-		      !pvpatch ? __stringify(CONFIG_ARM_PATCH_PHYS_VIRT) : "");
-	}
+		return 0;
 
 	if (mem_start < KEYSTONE_HIGH_PHYS_START ||
 	    mem_end   > KEYSTONE_HIGH_PHYS_END) {
 		pr_crit("Invalid address space for memory (%08llx-%08llx)\n",
-		      (u64)mem_start, (u64)mem_end);
+		        (u64)mem_start, (u64)mem_end);
+		return 0;
 	}
 
-	offset += KEYSTONE_HIGH_PHYS_START;
-	__pv_phys_pfn_offset = PFN_DOWN(offset);
-	__pv_offset = (offset - PAGE_OFFSET);
+	offset = KEYSTONE_HIGH_PHYS_START - KEYSTONE_LOW_PHYS_START;
 
 	/* Populate the arch idmap hook */
 	arch_virt_to_idmap = keystone_virt_to_idmap;
-	platform_nb.notifier_call = keystone_platform_notifier;
-	keystone_dma_pfn_offset = PFN_DOWN(KEYSTONE_HIGH_PHYS_START -
-						KEYSTONE_LOW_PHYS_START);
 
-	pr_info("Switching to high address space at 0x%llx\n", (u64)offset);
+	return offset;
 }
 
 static const char *const keystone_match[] __initconst = {
@@ -115,5 +108,5 @@ DT_MACHINE_START(KEYSTONE, "Keystone")
 	.smp		= smp_ops(keystone_smp_ops),
 	.init_machine	= keystone_init,
 	.dt_compat	= keystone_match,
-	.init_meminfo   = keystone_init_meminfo,
+	.pv_fixup	= keystone_pv_fixup,
 MACHINE_END
diff --git a/arch/arm/mach-keystone/platsmp.c b/arch/arm/mach-keystone/platsmp.c
index 5f46a7cf907b..4bbb18463bfd 100644
--- a/arch/arm/mach-keystone/platsmp.c
+++ b/arch/arm/mach-keystone/platsmp.c
@@ -39,19 +39,6 @@ static int keystone_smp_boot_secondary(unsigned int cpu,
 	return error;
 }
 
-#ifdef CONFIG_ARM_LPAE
-static void __cpuinit keystone_smp_secondary_initmem(unsigned int cpu)
-{
-	pgd_t *pgd0 = pgd_offset_k(0);
-	cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
-	local_flush_tlb_all();
-}
-#else
-static inline void __cpuinit keystone_smp_secondary_initmem(unsigned int cpu)
-{}
-#endif
-
 struct smp_operations keystone_smp_ops __initdata = {
 	.smp_boot_secondary	= keystone_smp_boot_secondary,
-	.smp_secondary_init     = keystone_smp_secondary_initmem,
 };
diff --git a/arch/arm/mach-nspire/nspire.c b/arch/arm/mach-nspire/nspire.c
index 3445a5686805..34c2a1b32e7d 100644
--- a/arch/arm/mach-nspire/nspire.c
+++ b/arch/arm/mach-nspire/nspire.c
@@ -22,8 +22,6 @@
 #include <asm/mach-types.h>
 #include <asm/mach/map.h>
 
-#include <asm/hardware/timer-sp.h>
-
 #include "mmio.h"
 #include "clcd.h"
 
diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index c309593abdb2..44575edc44b1 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -35,20 +35,19 @@
 #include <linux/mtd/physmap.h>
 #include <linux/memblock.h>
 
+#include <clocksource/timer-sp804.h>
+
 #include <mach/hardware.h>
 #include <asm/irq.h>
 #include <asm/mach-types.h>
-#include <asm/hardware/arm_timer.h>
 #include <asm/hardware/icst.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/irq.h>
 #include <asm/mach/map.h>
 
-
 #include <mach/platform.h>
 #include <mach/irqs.h>
-#include <asm/hardware/timer-sp.h>
 
 #include <plat/sched_clock.h>
 
@@ -381,10 +380,10 @@ void __init realview_timer_init(unsigned int timer_irq)
 	/*
 	 * Initialise to a known state (all timers off)
 	 */
-	writel(0, timer0_va_base + TIMER_CTRL);
-	writel(0, timer1_va_base + TIMER_CTRL);
-	writel(0, timer2_va_base + TIMER_CTRL);
-	writel(0, timer3_va_base + TIMER_CTRL);
+	sp804_timer_disable(timer0_va_base);
+	sp804_timer_disable(timer1_va_base);
+	sp804_timer_disable(timer2_va_base);
+	sp804_timer_disable(timer3_va_base);
 
 	sp804_clocksource_init(timer3_va_base, "timer3");
 	sp804_clockevents_init(timer0_va_base, timer_irq, "timer0");
diff --git a/arch/arm/mach-sa1100/Makefile b/arch/arm/mach-sa1100/Makefile
index 61ff91e76e0a..ebc4d58e1a32 100644
--- a/arch/arm/mach-sa1100/Makefile
+++ b/arch/arm/mach-sa1100/Makefile
@@ -3,7 +3,7 @@
 #
 
 # Common support
-obj-y := clock.o generic.o irq.o #nmi-oopser.o
+obj-y := clock.o generic.o #nmi-oopser.o
 
 # Specific board support
 obj-$(CONFIG_SA1100_ASSABET)		+= assabet.o
diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c
index 40e0d8619a2d..345e63f4eb71 100644
--- a/arch/arm/mach-sa1100/generic.c
+++ b/arch/arm/mach-sa1100/generic.c
@@ -20,9 +20,12 @@
 #include <linux/ioport.h>
 #include <linux/platform_device.h>
 #include <linux/reboot.h>
+#include <linux/irqchip/irq-sa11x0.h>
 
 #include <video/sa1100fb.h>
 
+#include <soc/sa1100/pwer.h>
+
 #include <asm/div64.h>
 #include <asm/mach/map.h>
 #include <asm/mach/flash.h>
@@ -375,6 +378,18 @@ void __init sa1100_timer_init(void)
 	pxa_timer_nodt_init(IRQ_OST0, io_p2v(0x90000000), 3686400);
 }
 
+static struct resource irq_resource =
+	DEFINE_RES_MEM_NAMED(0x90050000, SZ_64K, "irqs");
+
+void __init sa1100_init_irq(void)
+{
+	request_resource(&iomem_resource, &irq_resource);
+
+	sa11x0_init_irq_nodt(IRQ_GPIO0_SC, irq_resource.start);
+
+	sa1100_init_gpio();
+}
+
 /*
  * Disable the memory bus request/grant signals on the SA1110 to
  * ensure that we don't receive spurious memory requests.  We set
@@ -416,3 +431,25 @@ void sa1110_mb_enable(void)
 	local_irq_restore(flags);
 }
 
+int sa11x0_gpio_set_wake(unsigned int gpio, unsigned int on)
+{
+	if (on)
+		PWER |= BIT(gpio);
+	else
+		PWER &= ~BIT(gpio);
+
+	return 0;
+}
+
+int sa11x0_sc_set_wake(unsigned int irq, unsigned int on)
+{
+	if (BIT(irq) != IC_RTCAlrm)
+		return -EINVAL;
+
+	if (on)
+		PWER |= PWER_RTC;
+	else
+		PWER &= ~PWER_RTC;
+
+	return 0;
+}
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 6ea09fe53426..23a04fe5d2ad 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -41,8 +41,9 @@
 #include <linux/bitops.h>
 #include <linux/reboot.h>
 
+#include <clocksource/timer-sp804.h>
+
 #include <asm/irq.h>
-#include <asm/hardware/arm_timer.h>
 #include <asm/hardware/icst.h>
 #include <asm/mach-types.h>
 
@@ -52,7 +53,6 @@
 #include <asm/mach/map.h>
 #include <mach/hardware.h>
 #include <mach/platform.h>
-#include <asm/hardware/timer-sp.h>
 
 #include <plat/sched_clock.h>
 
@@ -798,10 +798,10 @@ void __init versatile_timer_init(void)
 	/*
 	 * Initialise to a known state (all timers off)
 	 */
-	writel(0, TIMER0_VA_BASE + TIMER_CTRL);
-	writel(0, TIMER1_VA_BASE + TIMER_CTRL);
-	writel(0, TIMER2_VA_BASE + TIMER_CTRL);
-	writel(0, TIMER3_VA_BASE + TIMER_CTRL);
+	sp804_timer_disable(TIMER0_VA_BASE);
+	sp804_timer_disable(TIMER1_VA_BASE);
+	sp804_timer_disable(TIMER2_VA_BASE);
+	sp804_timer_disable(TIMER3_VA_BASE);
 
 	sp804_clocksource_init(TIMER3_VA_BASE, "timer3");
 	sp804_clockevents_init(TIMER0_VA_BASE, IRQ_TIMERINT0_1, "timer0");
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index b4f92b9a13ac..7c6b976ab8d3 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -6,7 +6,7 @@ comment "Processor Type"
 
 # ARM7TDMI
 config CPU_ARM7TDMI
-	bool "Support ARM7TDMI processor"
+	bool
 	depends on !MMU
 	select CPU_32v4T
 	select CPU_ABRT_LV4T
@@ -56,7 +56,7 @@ config CPU_ARM740T
 
 # ARM9TDMI
 config CPU_ARM9TDMI
-	bool "Support ARM9TDMI processor"
+	bool
 	depends on !MMU
 	select CPU_32v4T
 	select CPU_ABRT_NOMMU
@@ -604,6 +604,22 @@ config CPU_USE_DOMAINS
 	  This option enables or disables the use of domain switching
 	  via the set_fs() function.
 
+config CPU_V7M_NUM_IRQ
+	int "Number of external interrupts connected to the NVIC"
+	depends on CPU_V7M
+	default 90 if ARCH_STM32
+	default 38 if ARCH_EFM32
+	default 112 if SOC_VF610
+	default 240
+	help
+	  This option indicates the number of interrupts connected to the NVIC.
+	  The value can be larger than the real number of interrupts supported
+	  by the system, but must not be lower.
+	  The default value is 240, corresponding to the maximum number of
+	  interrupts supported by the NVIC on Cortex-M family.
+
+	  If unsure, keep default value.
+
 #
 # CPU supports 36-bit I/O
 #
@@ -624,6 +640,10 @@ config ARM_LPAE
 
 	  If unsure, say N.
 
+config ARM_PV_FIXUP
+	def_bool y
+	depends on ARM_LPAE && ARM_PATCH_PHYS_VIRT && ARCH_KEYSTONE
+
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool ARM_LPAE
 
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index d3afdf9eb65a..57c8df500e8c 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_MODULES)		+= proc-syms.o
 obj-$(CONFIG_ALIGNMENT_TRAP)	+= alignment.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-$(CONFIG_ARM_PV_FIXUP)	+= pv-fixup-asm.o
 
 obj-$(CONFIG_CPU_ABRT_NOMMU)	+= abort-nommu.o
 obj-$(CONFIG_CPU_ABRT_EV4)	+= abort-ev4.o
@@ -55,6 +56,8 @@ obj-$(CONFIG_CPU_XSCALE)	+= copypage-xscale.o
 obj-$(CONFIG_CPU_XSC3)		+= copypage-xsc3.o
 obj-$(CONFIG_CPU_COPY_FA)	+= copypage-fa.o
 
+CFLAGS_copypage-feroceon.o := -march=armv5te
+
 obj-$(CONFIG_CPU_TLB_V4WT)	+= tlb-v4.o
 obj-$(CONFIG_CPU_TLB_V4WB)	+= tlb-v4wb.o
 obj-$(CONFIG_CPU_TLB_V4WBI)	+= tlb-v4wbi.o
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index e309c8f35af5..71b3d3309024 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -38,10 +38,11 @@ struct l2c_init_data {
 	unsigned way_size_0;
 	unsigned num_lock;
 	void (*of_parse)(const struct device_node *, u32 *, u32 *);
-	void (*enable)(void __iomem *, u32, unsigned);
+	void (*enable)(void __iomem *, unsigned);
 	void (*fixup)(void __iomem *, u32, struct outer_cache_fns *);
 	void (*save)(void __iomem *);
 	void (*configure)(void __iomem *);
+	void (*unlock)(void __iomem *, unsigned);
 	struct outer_cache_fns outer_cache;
 };
 
@@ -110,14 +111,6 @@ static inline void l2c_unlock(void __iomem *base, unsigned num)
 
 static void l2c_configure(void __iomem *base)
 {
-	if (outer_cache.configure) {
-		outer_cache.configure(&l2x0_saved_regs);
-		return;
-	}
-
-	if (l2x0_data->configure)
-		l2x0_data->configure(base);
-
 	l2c_write_sec(l2x0_saved_regs.aux_ctrl, base, L2X0_AUX_CTRL);
 }
 
@@ -125,18 +118,16 @@ static void l2c_configure(void __iomem *base)
  * Enable the L2 cache controller.  This function must only be
  * called when the cache controller is known to be disabled.
  */
-static void l2c_enable(void __iomem *base, u32 aux, unsigned num_lock)
+static void l2c_enable(void __iomem *base, unsigned num_lock)
 {
 	unsigned long flags;
 
-	/* Do not touch the controller if already enabled. */
-	if (readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)
-		return;
+	if (outer_cache.configure)
+		outer_cache.configure(&l2x0_saved_regs);
+	else
+		l2x0_data->configure(base);
 
-	l2x0_saved_regs.aux_ctrl = aux;
-	l2c_configure(base);
-
-	l2c_unlock(base, num_lock);
+	l2x0_data->unlock(base, num_lock);
 
 	local_irq_save(flags);
 	__l2c_op_way(base + L2X0_INV_WAY);
@@ -163,7 +154,11 @@ static void l2c_save(void __iomem *base)
 
 static void l2c_resume(void)
 {
-	l2c_enable(l2x0_base, l2x0_saved_regs.aux_ctrl, l2x0_data->num_lock);
+	void __iomem *base = l2x0_base;
+
+	/* Do not touch the controller if already enabled. */
+	if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN))
+		l2c_enable(base, l2x0_data->num_lock);
 }
 
 /*
@@ -252,6 +247,8 @@ static const struct l2c_init_data l2c210_data __initconst = {
 	.num_lock = 1,
 	.enable = l2c_enable,
 	.save = l2c_save,
+	.configure = l2c_configure,
+	.unlock = l2c_unlock,
 	.outer_cache = {
 		.inv_range = l2c210_inv_range,
 		.clean_range = l2c210_clean_range,
@@ -391,16 +388,22 @@ static void l2c220_sync(void)
 	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
 }
 
-static void l2c220_enable(void __iomem *base, u32 aux, unsigned num_lock)
+static void l2c220_enable(void __iomem *base, unsigned num_lock)
 {
 	/*
 	 * Always enable non-secure access to the lockdown registers -
 	 * we write to them as part of the L2C enable sequence so they
 	 * need to be accessible.
 	 */
-	aux |= L220_AUX_CTRL_NS_LOCKDOWN;
+	l2x0_saved_regs.aux_ctrl |= L220_AUX_CTRL_NS_LOCKDOWN;
 
-	l2c_enable(base, aux, num_lock);
+	l2c_enable(base, num_lock);
+}
+
+static void l2c220_unlock(void __iomem *base, unsigned num_lock)
+{
+	if (readl_relaxed(base + L2X0_AUX_CTRL) & L220_AUX_CTRL_NS_LOCKDOWN)
+		l2c_unlock(base, num_lock);
 }
 
 static const struct l2c_init_data l2c220_data = {
@@ -409,6 +412,8 @@ static const struct l2c_init_data l2c220_data = {
 	.num_lock = 1,
 	.enable = l2c220_enable,
 	.save = l2c_save,
+	.configure = l2c_configure,
+	.unlock = l2c220_unlock,
 	.outer_cache = {
 		.inv_range = l2c220_inv_range,
 		.clean_range = l2c220_clean_range,
@@ -569,6 +574,8 @@ static void l2c310_configure(void __iomem *base)
 {
 	unsigned revision;
 
+	l2c_configure(base);
+
 	/* restore pl310 setup */
 	l2c_write_sec(l2x0_saved_regs.tag_latency, base,
 		      L310_TAG_LATENCY_CTRL);
@@ -603,10 +610,11 @@ static int l2c310_cpu_enable_flz(struct notifier_block *nb, unsigned long act, v
 	return NOTIFY_OK;
 }
 
-static void __init l2c310_enable(void __iomem *base, u32 aux, unsigned num_lock)
+static void __init l2c310_enable(void __iomem *base, unsigned num_lock)
 {
 	unsigned rev = readl_relaxed(base + L2X0_CACHE_ID) & L2X0_CACHE_ID_RTL_MASK;
 	bool cortex_a9 = read_cpuid_part() == ARM_CPU_PART_CORTEX_A9;
+	u32 aux = l2x0_saved_regs.aux_ctrl;
 
 	if (rev >= L310_CACHE_ID_RTL_R2P0) {
 		if (cortex_a9) {
@@ -649,9 +657,9 @@ static void __init l2c310_enable(void __iomem *base, u32 aux, unsigned num_lock)
 	 * we write to them as part of the L2C enable sequence so they
 	 * need to be accessible.
 	 */
-	aux |= L310_AUX_CTRL_NS_LOCKDOWN;
+	l2x0_saved_regs.aux_ctrl = aux | L310_AUX_CTRL_NS_LOCKDOWN;
 
-	l2c_enable(base, aux, num_lock);
+	l2c_enable(base, num_lock);
 
 	/* Read back resulting AUX_CTRL value as it could have been altered. */
 	aux = readl_relaxed(base + L2X0_AUX_CTRL);
@@ -755,6 +763,12 @@ static void l2c310_resume(void)
 		set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1));
 }
 
+static void l2c310_unlock(void __iomem *base, unsigned num_lock)
+{
+	if (readl_relaxed(base + L2X0_AUX_CTRL) & L310_AUX_CTRL_NS_LOCKDOWN)
+		l2c_unlock(base, num_lock);
+}
+
 static const struct l2c_init_data l2c310_init_fns __initconst = {
 	.type = "L2C-310",
 	.way_size_0 = SZ_8K,
@@ -763,6 +777,7 @@ static const struct l2c_init_data l2c310_init_fns __initconst = {
 	.fixup = l2c310_fixup,
 	.save = l2c310_save,
 	.configure = l2c310_configure,
+	.unlock = l2c310_unlock,
 	.outer_cache = {
 		.inv_range = l2c210_inv_range,
 		.clean_range = l2c210_clean_range,
@@ -856,8 +871,11 @@ static int __init __l2c_init(const struct l2c_init_data *data,
 	 * Check if l2x0 controller is already enabled.  If we are booting
 	 * in non-secure mode accessing the below registers will fault.
 	 */
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN))
-		data->enable(l2x0_base, aux, data->num_lock);
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		l2x0_saved_regs.aux_ctrl = aux;
+
+		data->enable(l2x0_base, data->num_lock);
+	}
 
 	outer_cache = fns;
 
@@ -1066,6 +1084,8 @@ static const struct l2c_init_data of_l2c210_data __initconst = {
 	.of_parse = l2x0_of_parse,
 	.enable = l2c_enable,
 	.save = l2c_save,
+	.configure = l2c_configure,
+	.unlock = l2c_unlock,
 	.outer_cache = {
 		.inv_range   = l2c210_inv_range,
 		.clean_range = l2c210_clean_range,
@@ -1084,6 +1104,8 @@ static const struct l2c_init_data of_l2c220_data __initconst = {
 	.of_parse = l2x0_of_parse,
 	.enable = l2c220_enable,
 	.save = l2c_save,
+	.configure = l2c_configure,
+	.unlock = l2c220_unlock,
 	.outer_cache = {
 		.inv_range   = l2c220_inv_range,
 		.clean_range = l2c220_clean_range,
@@ -1199,6 +1221,26 @@ static void __init l2c310_of_parse(const struct device_node *np,
 		pr_err("L2C-310 OF arm,prefetch-offset property value is missing\n");
 	}
 
+	ret = of_property_read_u32(np, "prefetch-data", &val);
+	if (ret == 0) {
+		if (val)
+			prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
+		else
+			prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
+	} else if (ret != -EINVAL) {
+		pr_err("L2C-310 OF prefetch-data property value is missing\n");
+	}
+
+	ret = of_property_read_u32(np, "prefetch-instr", &val);
+	if (ret == 0) {
+		if (val)
+			prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
+		else
+			prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
+	} else if (ret != -EINVAL) {
+		pr_err("L2C-310 OF prefetch-instr property value is missing\n");
+	}
+
 	l2x0_saved_regs.prefetch_ctrl = prefetch;
 }
 
@@ -1211,6 +1253,7 @@ static const struct l2c_init_data of_l2c310_data __initconst = {
 	.fixup = l2c310_fixup,
 	.save  = l2c310_save,
 	.configure = l2c310_configure,
+	.unlock = l2c310_unlock,
 	.outer_cache = {
 		.inv_range   = l2c210_inv_range,
 		.clean_range = l2c210_clean_range,
@@ -1240,6 +1283,7 @@ static const struct l2c_init_data of_l2c310_coherent_data __initconst = {
 	.fixup = l2c310_fixup,
 	.save  = l2c310_save,
 	.configure = l2c310_configure,
+	.unlock = l2c310_unlock,
 	.outer_cache = {
 		.inv_range   = l2c210_inv_range,
 		.clean_range = l2c210_clean_range,
@@ -1366,7 +1410,7 @@ static void aurora_save(void __iomem *base)
  * For Aurora cache in no outer mode, enable via the CP15 coprocessor
  * broadcasting of cache commands to L2.
  */
-static void __init aurora_enable_no_outer(void __iomem *base, u32 aux,
+static void __init aurora_enable_no_outer(void __iomem *base,
 	unsigned num_lock)
 {
 	u32 u;
@@ -1377,7 +1421,7 @@ static void __init aurora_enable_no_outer(void __iomem *base, u32 aux,
 
 	isb();
 
-	l2c_enable(base, aux, num_lock);
+	l2c_enable(base, num_lock);
 }
 
 static void __init aurora_fixup(void __iomem *base, u32 cache_id,
@@ -1416,6 +1460,8 @@ static const struct l2c_init_data of_aurora_with_outer_data __initconst = {
 	.enable = l2c_enable,
 	.fixup = aurora_fixup,
 	.save  = aurora_save,
+	.configure = l2c_configure,
+	.unlock = l2c_unlock,
 	.outer_cache = {
 		.inv_range   = aurora_inv_range,
 		.clean_range = aurora_clean_range,
@@ -1435,6 +1481,8 @@ static const struct l2c_init_data of_aurora_no_outer_data __initconst = {
 	.enable = aurora_enable_no_outer,
 	.fixup = aurora_fixup,
 	.save  = aurora_save,
+	.configure = l2c_configure,
+	.unlock = l2c_unlock,
 	.outer_cache = {
 		.resume      = l2c_resume,
 	},
@@ -1585,6 +1633,7 @@ static const struct l2c_init_data of_bcm_l2x0_data __initconst = {
 	.enable = l2c310_enable,
 	.save  = l2c310_save,
 	.configure = l2c310_configure,
+	.unlock = l2c310_unlock,
 	.outer_cache = {
 		.inv_range   = bcm_inv_range,
 		.clean_range = bcm_clean_range,
@@ -1608,6 +1657,7 @@ static void __init tauros3_save(void __iomem *base)
 
 static void tauros3_configure(void __iomem *base)
 {
+	l2c_configure(base);
 	writel_relaxed(l2x0_saved_regs.aux2_ctrl,
 		       base + TAUROS3_AUX2_CTRL);
 	writel_relaxed(l2x0_saved_regs.prefetch_ctrl,
@@ -1621,6 +1671,7 @@ static const struct l2c_init_data of_tauros3_data __initconst = {
 	.enable = l2c_enable,
 	.save  = tauros3_save,
 	.configure = tauros3_configure,
+	.unlock = l2c_unlock,
 	/* Tauros3 broadcasts L1 cache operations to L2 */
 	.outer_cache = {
 		.resume      = l2c_resume,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7e7583ddd607..1ced8a0f7a52 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -148,11 +148,14 @@ static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs);
 static void arm_coherent_dma_free(struct device *dev, size_t size, void *cpu_addr,
 				  dma_addr_t handle, struct dma_attrs *attrs);
+static int arm_coherent_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		 void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		 struct dma_attrs *attrs);
 
 struct dma_map_ops arm_coherent_dma_ops = {
 	.alloc			= arm_coherent_dma_alloc,
 	.free			= arm_coherent_dma_free,
-	.mmap			= arm_dma_mmap,
+	.mmap			= arm_coherent_dma_mmap,
 	.get_sgtable		= arm_dma_get_sgtable,
 	.map_page		= arm_coherent_dma_map_page,
 	.map_sg			= arm_dma_map_sg,
@@ -690,10 +693,7 @@ static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 			   attrs, __builtin_return_address(0));
 }
 
-/*
- * Create userspace mapping for the DMA-coherent memory.
- */
-int arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		 void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		 struct dma_attrs *attrs)
 {
@@ -704,8 +704,6 @@ int arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	unsigned long pfn = dma_to_pfn(dev, dma_addr);
 	unsigned long off = vma->vm_pgoff;
 
-	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot);
-
 	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
 
@@ -720,6 +718,26 @@ int arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	return ret;
 }
 
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ */
+static int arm_coherent_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		 void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		 struct dma_attrs *attrs)
+{
+	return __arm_dma_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+}
+
+int arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		 void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		 struct dma_attrs *attrs)
+{
+#ifdef CONFIG_MMU
+	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot);
+#endif	/* CONFIG_MMU */
+	return __arm_dma_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+}
+
 /*
  * Free a buffer as defined by the above mapping.
  */
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 7186382672b5..6ca7d9aa896f 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1387,123 +1387,98 @@ static void __init map_lowmem(void)
 	}
 }
 
-#ifdef CONFIG_ARM_LPAE
+#ifdef CONFIG_ARM_PV_FIXUP
+extern unsigned long __atags_pointer;
+typedef void pgtables_remap(long long offset, unsigned long pgd, void *bdata);
+pgtables_remap lpae_pgtables_remap_asm;
+
 /*
  * early_paging_init() recreates boot time page table setup, allowing machines
  * to switch over to a high (>4G) address space on LPAE systems
  */
-void __init early_paging_init(const struct machine_desc *mdesc,
-			      struct proc_info_list *procinfo)
+void __init early_paging_init(const struct machine_desc *mdesc)
 {
-	pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
-	unsigned long map_start, map_end;
-	pgd_t *pgd0, *pgdk;
-	pud_t *pud0, *pudk, *pud_start;
-	pmd_t *pmd0, *pmdk;
-	phys_addr_t phys;
-	int i;
+	pgtables_remap *lpae_pgtables_remap;
+	unsigned long pa_pgd;
+	unsigned int cr, ttbcr;
+	long long offset;
+	void *boot_data;
 
-	if (!(mdesc->init_meminfo))
+	if (!mdesc->pv_fixup)
 		return;
 
-	/* remap kernel code and data */
-	map_start = init_mm.start_code & PMD_MASK;
-	map_end   = ALIGN(init_mm.brk, PMD_SIZE);
+	offset = mdesc->pv_fixup();
+	if (offset == 0)
+		return;
 
-	/* get a handle on things... */
-	pgd0 = pgd_offset_k(0);
-	pud_start = pud0 = pud_offset(pgd0, 0);
-	pmd0 = pmd_offset(pud0, 0);
+	/*
+	 * Get the address of the remap function in the 1:1 identity
+	 * mapping setup by the early page table assembly code.  We
+	 * must get this prior to the pv update.  The following barrier
+	 * ensures that this is complete before we fixup any P:V offsets.
+	 */
+	lpae_pgtables_remap = (pgtables_remap *)(unsigned long)__pa(lpae_pgtables_remap_asm);
+	pa_pgd = __pa(swapper_pg_dir);
+	boot_data = __va(__atags_pointer);
+	barrier();
 
-	pgdk = pgd_offset_k(map_start);
-	pudk = pud_offset(pgdk, map_start);
-	pmdk = pmd_offset(pudk, map_start);
+	pr_info("Switching physical address space to 0x%08llx\n",
+		(u64)PHYS_OFFSET + offset);
 
-	mdesc->init_meminfo();
+	/* Re-set the phys pfn offset, and the pv offset */
+	__pv_offset += offset;
+	__pv_phys_pfn_offset += PFN_DOWN(offset);
 
 	/* Run the patch stub to update the constants */
 	fixup_pv_table(&__pv_table_begin,
 		(&__pv_table_end - &__pv_table_begin) << 2);
 
 	/*
-	 * Cache cleaning operations for self-modifying code
-	 * We should clean the entries by MVA but running a
-	 * for loop over every pv_table entry pointer would
-	 * just complicate the code.
-	 */
-	flush_cache_louis();
-	dsb(ishst);
-	isb();
-
-	/*
-	 * FIXME: This code is not architecturally compliant: we modify
-	 * the mappings in-place, indeed while they are in use by this
-	 * very same code.  This may lead to unpredictable behaviour of
-	 * the CPU.
-	 *
-	 * Even modifying the mappings in a separate page table does
-	 * not resolve this.
-	 *
-	 * The architecture strongly recommends that when a mapping is
-	 * changed, that it is changed by first going via an invalid
-	 * mapping and back to the new mapping.  This is to ensure that
-	 * no TLB conflicts (caused by the TLB having more than one TLB
-	 * entry match a translation) can occur.  However, doing that
-	 * here will result in unmapping the code we are running.
-	 */
-	pr_warn("WARNING: unsafe modification of in-place page tables - tainting kernel\n");
-	add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
-
-	/*
-	 * Remap level 1 table.  This changes the physical addresses
-	 * used to refer to the level 2 page tables to the high
-	 * physical address alias, leaving everything else the same.
-	 */
-	for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
-		set_pud(pud0,
-			__pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
-		pmd0 += PTRS_PER_PMD;
-	}
-
-	/*
-	 * Remap the level 2 table, pointing the mappings at the high
-	 * physical address alias of these pages.
-	 */
-	phys = __pa(map_start);
-	do {
-		*pmdk++ = __pmd(phys | pmdprot);
-		phys += PMD_SIZE;
-	} while (phys < map_end);
-
-	/*
-	 * Ensure that the above updates are flushed out of the cache.
-	 * This is not strictly correct; on a system where the caches
-	 * are coherent with each other, but the MMU page table walks
-	 * may not be coherent, flush_cache_all() may be a no-op, and
-	 * this will fail.
+	 * We changing not only the virtual to physical mapping, but also
+	 * the physical addresses used to access memory.  We need to flush
+	 * all levels of cache in the system with caching disabled to
+	 * ensure that all data is written back, and nothing is prefetched
+	 * into the caches.  We also need to prevent the TLB walkers
+	 * allocating into the caches too.  Note that this is ARMv7 LPAE
+	 * specific.
 	 */
+	cr = get_cr();
+	set_cr(cr & ~(CR_I | CR_C));
+	asm("mrc p15, 0, %0, c2, c0, 2" : "=r" (ttbcr));
+	asm volatile("mcr p15, 0, %0, c2, c0, 2"
+		: : "r" (ttbcr & ~(3 << 8 | 3 << 10)));
 	flush_cache_all();
 
 	/*
-	 * Re-write the TTBR values to point them at the high physical
-	 * alias of the page tables.  We expect __va() will work on
-	 * cpu_get_pgd(), which returns the value of TTBR0.
+	 * Fixup the page tables - this must be in the idmap region as
+	 * we need to disable the MMU to do this safely, and hence it
+	 * needs to be assembly.  It's fairly simple, as we're using the
+	 * temporary tables setup by the initial assembly code.
 	 */
-	cpu_switch_mm(pgd0, &init_mm);
-	cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+	lpae_pgtables_remap(offset, pa_pgd, boot_data);
 
-	/* Finally flush any stale TLB values. */
-	local_flush_bp_all();
-	local_flush_tlb_all();
+	/* Re-enable the caches and cacheable TLB walks */
+	asm volatile("mcr p15, 0, %0, c2, c0, 2" : : "r" (ttbcr));
+	set_cr(cr);
 }
 
 #else
 
-void __init early_paging_init(const struct machine_desc *mdesc,
-			      struct proc_info_list *procinfo)
+void __init early_paging_init(const struct machine_desc *mdesc)
 {
-	if (mdesc->init_meminfo)
-		mdesc->init_meminfo();
+	long long offset;
+
+	if (!mdesc->pv_fixup)
+		return;
+
+	offset = mdesc->pv_fixup();
+	if (offset == 0)
+		return;
+
+	pr_crit("Physical address space modification is only to support Keystone2.\n");
+	pr_crit("Please enable ARM_LPAE and ARM_PATCH_PHYS_VIRT support to use this\n");
+	pr_crit("feature. Your kernel may crash now, have a good day.\n");
+	add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
 }
 
 #endif
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index a014dfacd5ca..afd7e05d95f1 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -303,15 +303,6 @@ void __init sanity_check_meminfo(void)
 	memblock_set_current_limit(end);
 }
 
-/*
- * early_paging_init() recreates boot time page table setup, allowing machines
- * to switch over to a high (>4G) address space on LPAE systems
- */
-void __init early_paging_init(const struct machine_desc *mdesc,
-			      struct proc_info_list *procinfo)
-{
-}
-
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
diff --git a/arch/arm/mm/proc-v7-2level.S b/arch/arm/mm/proc-v7-2level.S
index 10405b8d31af..c6141a5435c3 100644
--- a/arch/arm/mm/proc-v7-2level.S
+++ b/arch/arm/mm/proc-v7-2level.S
@@ -36,14 +36,16 @@
  *
  *	It is assumed that:
  *	- we are not using split page tables
+ *
+ *	Note that we always need to flush BTAC/BTB if IBE is set
+ *	even on Cortex-A8 revisions not affected by 430973.
+ *	If IBE is not set, the flush BTAC/BTB won't do anything.
  */
 ENTRY(cpu_ca8_switch_mm)
 #ifdef CONFIG_MMU
 	mov	r2, #0
-#ifdef CONFIG_ARM_ERRATA_430973
 	mcr	p15, 0, r2, c7, c5, 6		@ flush BTAC/BTB
 #endif
-#endif
 ENTRY(cpu_v7_switch_mm)
 #ifdef CONFIG_MMU
 	mmid	r1, r1				@ get mm->context.id
@@ -148,10 +150,10 @@ ENDPROC(cpu_v7_set_pte_ext)
 	 * Macro for setting up the TTBRx and TTBCR registers.
 	 * - \ttb0 and \ttb1 updated with the corresponding flags.
 	 */
-	.macro	v7_ttb_setup, zero, ttbr0, ttbr1, tmp
+	.macro	v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp
 	mcr	p15, 0, \zero, c2, c0, 2	@ TTB control register
-	ALT_SMP(orr	\ttbr0, \ttbr0, #TTB_FLAGS_SMP)
-	ALT_UP(orr	\ttbr0, \ttbr0, #TTB_FLAGS_UP)
+	ALT_SMP(orr	\ttbr0l, \ttbr0l, #TTB_FLAGS_SMP)
+	ALT_UP(orr	\ttbr0l, \ttbr0l, #TTB_FLAGS_UP)
 	ALT_SMP(orr	\ttbr1, \ttbr1, #TTB_FLAGS_SMP)
 	ALT_UP(orr	\ttbr1, \ttbr1, #TTB_FLAGS_UP)
 	mcr	p15, 0, \ttbr1, c2, c0, 1	@ load TTB1
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index d3daed0ae0ad..5e5720e8bc5f 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -126,11 +126,10 @@ ENDPROC(cpu_v7_set_pte_ext)
 	 * Macro for setting up the TTBRx and TTBCR registers.
 	 * - \ttbr1 updated.
 	 */
-	.macro	v7_ttb_setup, zero, ttbr0, ttbr1, tmp
+	.macro	v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp
 	ldr	\tmp, =swapper_pg_dir		@ swapper_pg_dir virtual address
-	mov	\tmp, \tmp, lsr #ARCH_PGD_SHIFT
-	cmp	\ttbr1, \tmp			@ PHYS_OFFSET > PAGE_OFFSET?
-	mrc	p15, 0, \tmp, c2, c0, 2		@ TTB control register
+	cmp	\ttbr1, \tmp, lsr #12		@ PHYS_OFFSET > PAGE_OFFSET?
+	mrc	p15, 0, \tmp, c2, c0, 2		@ TTB control egister
 	orr	\tmp, \tmp, #TTB_EAE
 	ALT_SMP(orr	\tmp, \tmp, #TTB_FLAGS_SMP)
 	ALT_UP(orr	\tmp, \tmp, #TTB_FLAGS_UP)
@@ -143,13 +142,10 @@ ENDPROC(cpu_v7_set_pte_ext)
 	 */
 	orrls	\tmp, \tmp, #TTBR1_SIZE				@ TTBCR.T1SZ
 	mcr	p15, 0, \tmp, c2, c0, 2				@ TTBCR
-	mov	\tmp, \ttbr1, lsr #(32 - ARCH_PGD_SHIFT)	@ upper bits
-	mov	\ttbr1, \ttbr1, lsl #ARCH_PGD_SHIFT		@ lower bits
+	mov	\tmp, \ttbr1, lsr #20
+	mov	\ttbr1, \ttbr1, lsl #12
 	addls	\ttbr1, \ttbr1, #TTBR1_OFFSET
 	mcrr	p15, 1, \ttbr1, \tmp, c2			@ load TTBR1
-	mov	\tmp, \ttbr0, lsr #(32 - ARCH_PGD_SHIFT)	@ upper bits
-	mov	\ttbr0, \ttbr0, lsl #ARCH_PGD_SHIFT		@ lower bits
-	mcrr	p15, 0, \ttbr0, \tmp, c2			@ load TTBR0
 	.endm
 
 	/*
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 75ae72160099..0716bbe19872 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -252,6 +252,12 @@ ENDPROC(cpu_pj4b_do_resume)
  *	Initialise TLB, Caches, and MMU state ready to switch the MMU
  *	on.  Return in r0 the new CP15 C1 control register setting.
  *
+ *	r1, r2, r4, r5, r9, r13 must be preserved - r13 is not a stack
+ *	r4: TTBR0 (low word)
+ *	r5: TTBR0 (high word if LPAE)
+ *	r8: TTBR1
+ *	r9: Main ID register
+ *
  *	This should be able to cover all ARMv7 cores.
  *
  *	It is assumed that:
@@ -279,6 +285,78 @@ __v7_ca17mp_setup:
 #endif
 	b	__v7_setup
 
+/*
+ * Errata:
+ *  r0, r10 available for use
+ *  r1, r2, r4, r5, r9, r13: must be preserved
+ *  r3: contains MIDR rX number in bits 23-20
+ *  r6: contains MIDR rXpY as 8-bit XY number
+ *  r9: MIDR
+ */
+__ca8_errata:
+#if defined(CONFIG_ARM_ERRATA_430973) && !defined(CONFIG_ARCH_MULTIPLATFORM)
+	teq	r3, #0x00100000			@ only present in r1p*
+	mrceq	p15, 0, r0, c1, c0, 1		@ read aux control register
+	orreq	r0, r0, #(1 << 6)		@ set IBE to 1
+	mcreq	p15, 0, r0, c1, c0, 1		@ write aux control register
+#endif
+#ifdef CONFIG_ARM_ERRATA_458693
+	teq	r6, #0x20			@ only present in r2p0
+	mrceq	p15, 0, r0, c1, c0, 1		@ read aux control register
+	orreq	r0, r0, #(1 << 5)		@ set L1NEON to 1
+	orreq	r0, r0, #(1 << 9)		@ set PLDNOP to 1
+	mcreq	p15, 0, r0, c1, c0, 1		@ write aux control register
+#endif
+#ifdef CONFIG_ARM_ERRATA_460075
+	teq	r6, #0x20			@ only present in r2p0
+	mrceq	p15, 1, r0, c9, c0, 2		@ read L2 cache aux ctrl register
+	tsteq	r0, #1 << 22
+	orreq	r0, r0, #(1 << 22)		@ set the Write Allocate disable bit
+	mcreq	p15, 1, r0, c9, c0, 2		@ write the L2 cache aux ctrl register
+#endif
+	b	__errata_finish
+
+__ca9_errata:
+#ifdef CONFIG_ARM_ERRATA_742230
+	cmp	r6, #0x22			@ only present up to r2p2
+	mrcle	p15, 0, r0, c15, c0, 1		@ read diagnostic register
+	orrle	r0, r0, #1 << 4			@ set bit #4
+	mcrle	p15, 0, r0, c15, c0, 1		@ write diagnostic register
+#endif
+#ifdef CONFIG_ARM_ERRATA_742231
+	teq	r6, #0x20			@ present in r2p0
+	teqne	r6, #0x21			@ present in r2p1
+	teqne	r6, #0x22			@ present in r2p2
+	mrceq	p15, 0, r0, c15, c0, 1		@ read diagnostic register
+	orreq	r0, r0, #1 << 12		@ set bit #12
+	orreq	r0, r0, #1 << 22		@ set bit #22
+	mcreq	p15, 0, r0, c15, c0, 1		@ write diagnostic register
+#endif
+#ifdef CONFIG_ARM_ERRATA_743622
+	teq	r3, #0x00200000			@ only present in r2p*
+	mrceq	p15, 0, r0, c15, c0, 1		@ read diagnostic register
+	orreq	r0, r0, #1 << 6			@ set bit #6
+	mcreq	p15, 0, r0, c15, c0, 1		@ write diagnostic register
+#endif
+#if defined(CONFIG_ARM_ERRATA_751472) && defined(CONFIG_SMP)
+	ALT_SMP(cmp r6, #0x30)			@ present prior to r3p0
+	ALT_UP_B(1f)
+	mrclt	p15, 0, r0, c15, c0, 1		@ read diagnostic register
+	orrlt	r0, r0, #1 << 11		@ set bit #11
+	mcrlt	p15, 0, r0, c15, c0, 1		@ write diagnostic register
+1:
+#endif
+	b	__errata_finish
+
+__ca15_errata:
+#ifdef CONFIG_ARM_ERRATA_773022
+	cmp	r6, #0x4			@ only present up to r0p4
+	mrcle	p15, 0, r0, c1, c0, 1		@ read aux control register
+	orrle	r0, r0, #1 << 1			@ disable loop buffer
+	mcrle	p15, 0, r0, c1, c0, 1		@ write aux control register
+#endif
+	b	__errata_finish
+
 __v7_pj4b_setup:
 #ifdef CONFIG_CPU_PJ4B
 
@@ -339,96 +417,38 @@ __v7_setup:
 	bl      v7_invalidate_l1
 	ldmia	r12, {r0-r5, r7, r9, r11, lr}
 
-	mrc	p15, 0, r0, c0, c0, 0		@ read main ID register
-	and	r10, r0, #0xff000000		@ ARM?
-	teq	r10, #0x41000000
-	bne	3f
-	and	r5, r0, #0x00f00000		@ variant
-	and	r6, r0, #0x0000000f		@ revision
-	orr	r6, r6, r5, lsr #20-4		@ combine variant and revision
-	ubfx	r0, r0, #4, #12			@ primary part number
+	and	r0, r9, #0xff000000		@ ARM?
+	teq	r0, #0x41000000
+	bne	__errata_finish
+	and	r3, r9, #0x00f00000		@ variant
+	and	r6, r9, #0x0000000f		@ revision
+	orr	r6, r6, r3, lsr #20-4		@ combine variant and revision
+	ubfx	r0, r9, #4, #12			@ primary part number
 
 	/* Cortex-A8 Errata */
 	ldr	r10, =0x00000c08		@ Cortex-A8 primary part number
 	teq	r0, r10
-	bne	2f
-#if defined(CONFIG_ARM_ERRATA_430973) && !defined(CONFIG_ARCH_MULTIPLATFORM)
-
-	teq	r5, #0x00100000			@ only present in r1p*
-	mrceq	p15, 0, r10, c1, c0, 1		@ read aux control register
-	orreq	r10, r10, #(1 << 6)		@ set IBE to 1
-	mcreq	p15, 0, r10, c1, c0, 1		@ write aux control register
-#endif
-#ifdef CONFIG_ARM_ERRATA_458693
-	teq	r6, #0x20			@ only present in r2p0
-	mrceq	p15, 0, r10, c1, c0, 1		@ read aux control register
-	orreq	r10, r10, #(1 << 5)		@ set L1NEON to 1
-	orreq	r10, r10, #(1 << 9)		@ set PLDNOP to 1
-	mcreq	p15, 0, r10, c1, c0, 1		@ write aux control register
-#endif
-#ifdef CONFIG_ARM_ERRATA_460075
-	teq	r6, #0x20			@ only present in r2p0
-	mrceq	p15, 1, r10, c9, c0, 2		@ read L2 cache aux ctrl register
-	tsteq	r10, #1 << 22
-	orreq	r10, r10, #(1 << 22)		@ set the Write Allocate disable bit
-	mcreq	p15, 1, r10, c9, c0, 2		@ write the L2 cache aux ctrl register
-#endif
-	b	3f
+	beq	__ca8_errata
 
 	/* Cortex-A9 Errata */
-2:	ldr	r10, =0x00000c09		@ Cortex-A9 primary part number
+	ldr	r10, =0x00000c09		@ Cortex-A9 primary part number
 	teq	r0, r10
-	bne	3f
-#ifdef CONFIG_ARM_ERRATA_742230
-	cmp	r6, #0x22			@ only present up to r2p2
-	mrcle	p15, 0, r10, c15, c0, 1		@ read diagnostic register
-	orrle	r10, r10, #1 << 4		@ set bit #4
-	mcrle	p15, 0, r10, c15, c0, 1		@ write diagnostic register
-#endif
-#ifdef CONFIG_ARM_ERRATA_742231
-	teq	r6, #0x20			@ present in r2p0
-	teqne	r6, #0x21			@ present in r2p1
-	teqne	r6, #0x22			@ present in r2p2
-	mrceq	p15, 0, r10, c15, c0, 1		@ read diagnostic register
-	orreq	r10, r10, #1 << 12		@ set bit #12
-	orreq	r10, r10, #1 << 22		@ set bit #22
-	mcreq	p15, 0, r10, c15, c0, 1		@ write diagnostic register
-#endif
-#ifdef CONFIG_ARM_ERRATA_743622
-	teq	r5, #0x00200000			@ only present in r2p*
-	mrceq	p15, 0, r10, c15, c0, 1		@ read diagnostic register
-	orreq	r10, r10, #1 << 6		@ set bit #6
-	mcreq	p15, 0, r10, c15, c0, 1		@ write diagnostic register
-#endif
-#if defined(CONFIG_ARM_ERRATA_751472) && defined(CONFIG_SMP)
-	ALT_SMP(cmp r6, #0x30)			@ present prior to r3p0
-	ALT_UP_B(1f)
-	mrclt	p15, 0, r10, c15, c0, 1		@ read diagnostic register
-	orrlt	r10, r10, #1 << 11		@ set bit #11
-	mcrlt	p15, 0, r10, c15, c0, 1		@ write diagnostic register
-1:
-#endif
+	beq	__ca9_errata
 
 	/* Cortex-A15 Errata */
-3:	ldr	r10, =0x00000c0f		@ Cortex-A15 primary part number
+	ldr	r10, =0x00000c0f		@ Cortex-A15 primary part number
 	teq	r0, r10
-	bne	4f
+	beq	__ca15_errata
 
-#ifdef CONFIG_ARM_ERRATA_773022
-	cmp	r6, #0x4			@ only present up to r0p4
-	mrcle	p15, 0, r10, c1, c0, 1		@ read aux control register
-	orrle	r10, r10, #1 << 1		@ disable loop buffer
-	mcrle	p15, 0, r10, c1, c0, 1		@ write aux control register
-#endif
-
-4:	mov	r10, #0
+__errata_finish:
+	mov	r10, #0
 	mcr	p15, 0, r10, c7, c5, 0		@ I+BTB cache invalidate
 #ifdef CONFIG_MMU
 	mcr	p15, 0, r10, c8, c7, 0		@ invalidate I + D TLBs
-	v7_ttb_setup r10, r4, r8, r5		@ TTBCR, TTBRx setup
-	ldr	r5, =PRRR			@ PRRR
+	v7_ttb_setup r10, r4, r5, r8, r3	@ TTBCR, TTBRx setup
+	ldr	r3, =PRRR			@ PRRR
 	ldr	r6, =NMRR			@ NMRR
-	mcr	p15, 0, r5, c10, c2, 0		@ write PRRR
+	mcr	p15, 0, r3, c10, c2, 0		@ write PRRR
 	mcr	p15, 0, r6, c10, c2, 1		@ write NMRR
 #endif
 	dsb					@ Complete invalidations
@@ -437,22 +457,22 @@ __v7_setup:
 	and	r0, r0, #(0xf << 12)		@ ThumbEE enabled field
 	teq	r0, #(1 << 12)			@ check if ThumbEE is present
 	bne	1f
-	mov	r5, #0
-	mcr	p14, 6, r5, c1, c0, 0		@ Initialize TEEHBR to 0
+	mov	r3, #0
+	mcr	p14, 6, r3, c1, c0, 0		@ Initialize TEEHBR to 0
 	mrc	p14, 6, r0, c0, c0, 0		@ load TEECR
 	orr	r0, r0, #1			@ set the 1st bit in order to
 	mcr	p14, 6, r0, c0, c0, 0		@ stop userspace TEEHBR access
 1:
 #endif
-	adr	r5, v7_crval
-	ldmia	r5, {r5, r6}
+	adr	r3, v7_crval
+	ldmia	r3, {r3, r6}
  ARM_BE8(orr	r6, r6, #1 << 25)		@ big-endian page tables
 #ifdef CONFIG_SWP_EMULATE
-	orr     r5, r5, #(1 << 10)              @ set SW bit in "clear"
+	orr     r3, r3, #(1 << 10)              @ set SW bit in "clear"
 	bic     r6, r6, #(1 << 10)              @ clear it in "mmuset"
 #endif
    	mrc	p15, 0, r0, c1, c0, 0		@ read control register
-	bic	r0, r0, r5			@ clear bits them
+	bic	r0, r0, r3			@ clear bits them
 	orr	r0, r0, r6			@ set them
  THUMB(	orr	r0, r0, #1 << 30	)	@ Thumb exceptions
 	ret	lr				@ return to head.S:__ret
diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S
index e08e1f2bab76..67d9209077c6 100644
--- a/arch/arm/mm/proc-v7m.S
+++ b/arch/arm/mm/proc-v7m.S
@@ -98,7 +98,7 @@ __v7m_setup:
 	str	r5, [r0, V7M_SCB_SHPR3]	@ set PendSV priority
 
 	@ SVC to run the kernel in this mode
-	adr	r1, BSYM(1f)
+	badr	r1, 1f
 	ldr	r5, [r12, #11 * 4]	@ read the SVC vector entry
 	str	r1, [r12, #11 * 4]	@ write the temporary SVC vector entry
 	mov	r6, lr			@ save LR
diff --git a/arch/arm/mm/pv-fixup-asm.S b/arch/arm/mm/pv-fixup-asm.S
new file mode 100644
index 000000000000..1867f3e43016
--- /dev/null
+++ b/arch/arm/mm/pv-fixup-asm.S
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (C) 2015 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This assembly is required to safely remap the physical address space
+ * for Keystone 2
+ */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/cp15.h>
+#include <asm/memory.h>
+#include <asm/pgtable.h>
+
+	.section ".idmap.text", "ax"
+
+#define L1_ORDER 3
+#define L2_ORDER 3
+
+ENTRY(lpae_pgtables_remap_asm)
+	stmfd	sp!, {r4-r8, lr}
+
+	mrc	p15, 0, r8, c1, c0, 0		@ read control reg
+	bic	ip, r8, #CR_M			@ disable caches and MMU
+	mcr	p15, 0, ip, c1, c0, 0
+	dsb
+	isb
+
+	/* Update level 2 entries covering the kernel */
+	ldr	r6, =(_end - 1)
+	add	r7, r2, #0x1000
+	add	r6, r7, r6, lsr #SECTION_SHIFT - L2_ORDER
+	add	r7, r7, #PAGE_OFFSET >> (SECTION_SHIFT - L2_ORDER)
+1:	ldrd	r4, [r7]
+	adds	r4, r4, r0
+	adc	r5, r5, r1
+	strd	r4, [r7], #1 << L2_ORDER
+	cmp	r7, r6
+	bls	1b
+
+	/* Update level 2 entries for the boot data */
+	add	r7, r2, #0x1000
+	add	r7, r7, r3, lsr #SECTION_SHIFT - L2_ORDER
+	bic	r7, r7, #(1 << L2_ORDER) - 1
+	ldrd	r4, [r7]
+	adds	r4, r4, r0
+	adc	r5, r5, r1
+	strd	r4, [r7], #1 << L2_ORDER
+	ldrd	r4, [r7]
+	adds	r4, r4, r0
+	adc	r5, r5, r1
+	strd	r4, [r7]
+
+	/* Update level 1 entries */
+	mov	r6, #4
+	mov	r7, r2
+2:	ldrd	r4, [r7]
+	adds	r4, r4, r0
+	adc	r5, r5, r1
+	strd	r4, [r7], #1 << L1_ORDER
+	subs	r6, r6, #1
+	bne	2b
+
+	mrrc	p15, 0, r4, r5, c2		@ read TTBR0
+	adds	r4, r4, r0			@ update physical address
+	adc	r5, r5, r1
+	mcrr	p15, 0, r4, r5, c2		@ write back TTBR0
+	mrrc	p15, 1, r4, r5, c2		@ read TTBR1
+	adds	r4, r4, r0			@ update physical address
+	adc	r5, r5, r1
+	mcrr	p15, 1, r4, r5, c2		@ write back TTBR1
+
+	dsb
+
+	mov	ip, #0
+	mcr	p15, 0, ip, c7, c5, 0		@ I+BTB cache invalidate
+	mcr	p15, 0, ip, c8, c7, 0		@ local_flush_tlb_all()
+	dsb
+	isb
+
+	mcr	p15, 0, r8, c1, c0, 0		@ re-enable MMU
+	dsb
+	isb
+
+	ldmfd	sp!, {r4-r8, pc}
+ENDPROC(lpae_pgtables_remap_asm)
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index 8aa791051029..9d259d94e429 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -6,9 +6,15 @@ obj-vdso := vgettimeofday.o datapage.o
 targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds
 obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
 
-ccflags-y := -shared -fPIC -fno-common -fno-builtin -fno-stack-protector
-ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 -DDISABLE_BRANCH_PROFILING
-ccflags-y += -Wl,--no-undefined $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+ccflags-y := -fPIC -fno-common -fno-builtin -fno-stack-protector
+ccflags-y += -DDISABLE_BRANCH_PROFILING
+
+VDSO_LDFLAGS := -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
+VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
+VDSO_LDFLAGS += -nostdlib -shared
+VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id)
+VDSO_LDFLAGS += $(call cc-option, -fuse-ld=bfd)
 
 obj-$(CONFIG_VDSO) += vdso.o
 extra-$(CONFIG_VDSO) += vdso.lds
@@ -40,10 +46,8 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
 
 # Actual build commands
 quiet_cmd_vdsold = VDSO    $@
-      cmd_vdsold = $(CC) $(c_flags) -Wl,-T $(filter %.lds,$^) $(filter %.o,$^) \
-                   $(call cc-ldoption, -Wl$(comma)--build-id) \
-                   -Wl,-Bsymbolic -Wl,-z,max-page-size=4096 \
-                   -Wl,-z,common-page-size=4096 -o $@
+      cmd_vdsold = $(CC) $(c_flags) $(VDSO_LDFLAGS) \
+                   -Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
 
 quiet_cmd_vdsomunge = MUNGE   $@
       cmd_vdsomunge = $(objtree)/$(obj)/vdsomunge $< $@
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 352b6a29910f..4e57730e0be4 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -142,6 +142,12 @@ config ARM_GLOBAL_TIMER
 	help
 	  This options enables support for the ARM global timer unit
 
+config ARM_TIMER_SP804
+	bool "Support for Dual Timer SP804 module"
+	depends on GENERIC_SCHED_CLOCK && CLKDEV_LOOKUP
+	select CLKSRC_MMIO
+	select CLKSRC_OF if OF
+
 config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK
 	bool
 	depends on ARM_GLOBAL_TIMER
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index e268b5e1901c..f228354961ca 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MTK_TIMER)		+= mtk_timer.o
 obj-$(CONFIG_ARM_ARCH_TIMER)		+= arm_arch_timer.o
 obj-$(CONFIG_ARM_GLOBAL_TIMER)		+= arm_global_timer.o
 obj-$(CONFIG_ARMV7M_SYSTICK)		+= armv7m_systick.o
+obj-$(CONFIG_ARM_TIMER_SP804)		+= timer-sp804.o
 obj-$(CONFIG_CLKSRC_METAG_GENERIC)	+= metag_generic.o
 obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST)	+= dummy_timer.o
 obj-$(CONFIG_ARCH_KEYSTONE)		+= timer-keystone.o
diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c
index c97d1980c0f8..a68866e0ecd4 100644
--- a/drivers/clocksource/timer-integrator-ap.c
+++ b/drivers/clocksource/timer-integrator-ap.c
@@ -26,7 +26,8 @@
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/sched_clock.h>
-#include <asm/hardware/arm_timer.h>
+
+#include "timer-sp.h"
 
 static void __iomem * sched_clk_base;
 
diff --git a/arch/arm/include/asm/hardware/arm_timer.h b/drivers/clocksource/timer-sp.h
similarity index 93%
rename from arch/arm/include/asm/hardware/arm_timer.h
rename to drivers/clocksource/timer-sp.h
index d6030ff599db..050d88561e9c 100644
--- a/arch/arm/include/asm/hardware/arm_timer.h
+++ b/drivers/clocksource/timer-sp.h
@@ -1,6 +1,3 @@
-#ifndef __ASM_ARM_HARDWARE_ARM_TIMER_H
-#define __ASM_ARM_HARDWARE_ARM_TIMER_H
-
 /*
  * ARM timer implementation, found in Integrator, Versatile and Realview
  * platforms.  Not all platforms support all registers and bits in these
@@ -31,5 +28,3 @@
 #define TIMER_RIS	0x10			/*  CVR ro */
 #define TIMER_MIS	0x14			/*  CVR ro */
 #define TIMER_BGLOAD	0x18			/*  CVR rw */
-
-#endif
diff --git a/arch/arm/common/timer-sp.c b/drivers/clocksource/timer-sp804.c
similarity index 97%
rename from arch/arm/common/timer-sp.c
rename to drivers/clocksource/timer-sp804.c
index 19211324772f..ca02503f17d1 100644
--- a/arch/arm/common/timer-sp.c
+++ b/drivers/clocksource/timer-sp804.c
@@ -1,5 +1,5 @@
 /*
- *  linux/arch/arm/common/timer-sp.c
+ *  linux/drivers/clocksource/timer-sp.c
  *
  *  Copyright (C) 1999 - 2003 ARM Limited
  *  Copyright (C) 2000 Deep Blue Solutions Ltd
@@ -30,8 +30,9 @@
 #include <linux/of_irq.h>
 #include <linux/sched_clock.h>
 
-#include <asm/hardware/arm_timer.h>
-#include <asm/hardware/timer-sp.h>
+#include <clocksource/timer-sp804.h>
+
+#include "timer-sp.h"
 
 static long __init sp804_get_clock_rate(struct clk *clk)
 {
@@ -71,6 +72,11 @@ static u64 notrace sp804_read(void)
 	return ~readl_relaxed(sched_clock_base + TIMER_VALUE);
 }
 
+void __init sp804_timer_disable(void __iomem *base)
+{
+	writel(0, base + TIMER_CTRL);
+}
+
 void __init __sp804_clocksource_and_sched_clock_init(void __iomem *base,
 						     const char *name,
 						     struct clk *clk,
diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c
index 40c34faffe59..db2ede565f1a 100644
--- a/drivers/cpuidle/cpuidle-big_little.c
+++ b/drivers/cpuidle/cpuidle-big_little.c
@@ -108,13 +108,7 @@ static int notrace bl_powerdown_finisher(unsigned long arg)
 	unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 
 	mcpm_set_entry_vector(cpu, cluster, cpu_resume);
-
-	/*
-	 * Residency value passed to mcpm_cpu_suspend back-end
-	 * has to be given clear semantics. Set to 0 as a
-	 * temporary value.
-	 */
-	mcpm_cpu_suspend(0);
+	mcpm_cpu_suspend();
 
 	/* return value != 0 means failure */
 	return 1;
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index f8efb7087760..5c9adf1f554d 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_ARCH_MEDIATEK)		+= irq-mtk-sysirq.o
 obj-$(CONFIG_ARCH_DIGICOLOR)		+= irq-digicolor.o
 obj-$(CONFIG_RENESAS_H8300H_INTC)	+= irq-renesas-h8300h.o
 obj-$(CONFIG_RENESAS_H8S_INTC)		+= irq-renesas-h8s.o
+obj-$(CONFIG_ARCH_SA1100)		+= irq-sa11x0.o
diff --git a/arch/arm/mach-sa1100/irq.c b/drivers/irqchip/irq-sa11x0.c
similarity index 64%
rename from arch/arm/mach-sa1100/irq.c
rename to drivers/irqchip/irq-sa11x0.c
index 65aebfa66fe5..46df2875dc1c 100644
--- a/arch/arm/mach-sa1100/irq.c
+++ b/drivers/irqchip/irq-sa11x0.c
@@ -1,9 +1,8 @@
 /*
- * linux/arch/arm/mach-sa1100/irq.c
- *
+ * Copyright (C) 2015 Dmitry Eremin-Solenikov
  * Copyright (C) 1999-2001 Nicolas Pitre
  *
- * Generic IRQ handling for the SA11x0, GPIO 11-27 IRQ demultiplexing.
+ * Generic IRQ handling for the SA11x0.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -15,16 +14,21 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
-#include <linux/ioport.h>
 #include <linux/syscore_ops.h>
+#include <linux/irqchip/irq-sa11x0.h>
+
+#include <soc/sa1100/pwer.h>
 
-#include <mach/hardware.h>
-#include <mach/irqs.h>
-#include <asm/mach/irq.h>
 #include <asm/exception.h>
 
-#include "generic.h"
+#define ICIP	0x00  /* IC IRQ Pending reg. */
+#define ICMR	0x04  /* IC Mask Reg.        */
+#define ICLR	0x08  /* IC Level Reg.       */
+#define ICCR	0x0C  /* IC Control Reg.     */
+#define ICFP	0x10  /* IC FIQ Pending reg. */
+#define ICPR	0x20  /* IC Pending Reg.     */
 
+static void __iomem *iobase;
 
 /*
  * We don't need to ACK IRQs on the SA1100 unless they're GPIOs
@@ -32,27 +36,25 @@
  */
 static void sa1100_mask_irq(struct irq_data *d)
 {
-	ICMR &= ~BIT(d->hwirq);
+	u32 reg;
+
+	reg = readl_relaxed(iobase + ICMR);
+	reg &= ~BIT(d->hwirq);
+	writel_relaxed(reg, iobase + ICMR);
 }
 
 static void sa1100_unmask_irq(struct irq_data *d)
 {
-	ICMR |= BIT(d->hwirq);
+	u32 reg;
+
+	reg = readl_relaxed(iobase + ICMR);
+	reg |= BIT(d->hwirq);
+	writel_relaxed(reg, iobase + ICMR);
 }
 
-/*
- * Apart form GPIOs, only the RTC alarm can be a wakeup event.
- */
 static int sa1100_set_wake(struct irq_data *d, unsigned int on)
 {
-	if (BIT(d->hwirq) == IC_RTCAlrm) {
-		if (on)
-			PWER |= PWER_RTC;
-		else
-			PWER &= ~PWER_RTC;
-		return 0;
-	}
-	return -EINVAL;
+	return sa11x0_sc_set_wake(d->hwirq, on);
 }
 
 static struct irq_chip sa1100_normal_chip = {
@@ -73,16 +75,13 @@ static int sa1100_normal_irqdomain_map(struct irq_domain *d,
 	return 0;
 }
 
-static struct irq_domain_ops sa1100_normal_irqdomain_ops = {
+static const struct irq_domain_ops sa1100_normal_irqdomain_ops = {
 	.map = sa1100_normal_irqdomain_map,
 	.xlate = irq_domain_xlate_onetwocell,
 };
 
 static struct irq_domain *sa1100_normal_irqdomain;
 
-static struct resource irq_resource =
-	DEFINE_RES_MEM_NAMED(0x90050000, SZ_64K, "irqs");
-
 static struct sa1100irq_state {
 	unsigned int	saved;
 	unsigned int	icmr;
@@ -95,16 +94,14 @@ static int sa1100irq_suspend(void)
 	struct sa1100irq_state *st = &sa1100irq_state;
 
 	st->saved = 1;
-	st->icmr = ICMR;
-	st->iclr = ICLR;
-	st->iccr = ICCR;
+	st->icmr = readl_relaxed(iobase + ICMR);
+	st->iclr = readl_relaxed(iobase + ICLR);
+	st->iccr = readl_relaxed(iobase + ICCR);
 
 	/*
 	 * Disable all GPIO-based interrupts.
 	 */
-	ICMR &= ~(IC_GPIO11_27|IC_GPIO10|IC_GPIO9|IC_GPIO8|IC_GPIO7|
-		  IC_GPIO6|IC_GPIO5|IC_GPIO4|IC_GPIO3|IC_GPIO2|
-		  IC_GPIO1|IC_GPIO0);
+	writel_relaxed(st->icmr & 0xfffff000, iobase + ICMR);
 
 	return 0;
 }
@@ -114,10 +111,10 @@ static void sa1100irq_resume(void)
 	struct sa1100irq_state *st = &sa1100irq_state;
 
 	if (st->saved) {
-		ICCR = st->iccr;
-		ICLR = st->iclr;
+		writel_relaxed(st->iccr, iobase + ICCR);
+		writel_relaxed(st->iclr, iobase + ICLR);
 
-		ICMR = st->icmr;
+		writel_relaxed(st->icmr, iobase + ICMR);
 	}
 }
 
@@ -140,8 +137,8 @@ sa1100_handle_irq(struct pt_regs *regs)
 	uint32_t icip, icmr, mask;
 
 	do {
-		icip = (ICIP);
-		icmr = (ICMR);
+		icip = readl_relaxed(iobase + ICIP);
+		icmr = readl_relaxed(iobase + ICMR);
 		mask = icip & icmr;
 
 		if (mask == 0)
@@ -152,27 +149,27 @@ sa1100_handle_irq(struct pt_regs *regs)
 	} while (1);
 }
 
-void __init sa1100_init_irq(void)
+void __init sa11x0_init_irq_nodt(int irq_start, resource_size_t io_start)
 {
-	request_resource(&iomem_resource, &irq_resource);
+	iobase = ioremap(io_start, SZ_64K);
+	if (WARN_ON(!iobase))
+		return;
 
 	/* disable all IRQs */
-	ICMR = 0;
+	writel_relaxed(0, iobase + ICMR);
 
 	/* all IRQs are IRQ, not FIQ */
-	ICLR = 0;
+	writel_relaxed(0, iobase + ICLR);
 
 	/*
 	 * Whatever the doc says, this has to be set for the wait-on-irq
 	 * instruction to work... on a SA1100 rev 9 at least.
 	 */
-	ICCR = 1;
+	writel_relaxed(1, iobase + ICCR);
 
 	sa1100_normal_irqdomain = irq_domain_add_simple(NULL,
-			32, IRQ_GPIO0_SC,
+			32, irq_start,
 			&sa1100_normal_irqdomain_ops, NULL);
 
 	set_handle_irq(sa1100_handle_irq);
-
-	sa1100_init_gpio();
 }
diff --git a/arch/arm/include/asm/hardware/timer-sp.h b/include/clocksource/timer-sp804.h
similarity index 85%
rename from arch/arm/include/asm/hardware/timer-sp.h
rename to include/clocksource/timer-sp804.h
index bb28af7c32de..1f8a1caa7cb4 100644
--- a/arch/arm/include/asm/hardware/timer-sp.h
+++ b/include/clocksource/timer-sp804.h
@@ -1,9 +1,13 @@
+#ifndef __CLKSOURCE_TIMER_SP804_H
+#define __CLKSOURCE_TIMER_SP804_H
+
 struct clk;
 
 void __sp804_clocksource_and_sched_clock_init(void __iomem *,
 					      const char *, struct clk *, int);
 void __sp804_clockevents_init(void __iomem *, unsigned int,
 			      struct clk *, const char *);
+void sp804_timer_disable(void __iomem *);
 
 static inline void sp804_clocksource_init(void __iomem *base, const char *name)
 {
@@ -21,3 +25,4 @@ static inline void sp804_clockevents_init(void __iomem *base, unsigned int irq,
 	__sp804_clockevents_init(base, irq, NULL, name);
 
 }
+#endif
diff --git a/include/linux/irqchip/irq-sa11x0.h b/include/linux/irqchip/irq-sa11x0.h
new file mode 100644
index 000000000000..15db6829c1e4
--- /dev/null
+++ b/include/linux/irqchip/irq-sa11x0.h
@@ -0,0 +1,17 @@
+/*
+ * Generic IRQ handling for the SA11x0.
+ *
+ * Copyright (C) 2015 Dmitry Eremin-Solenikov
+ * Copyright (C) 1999-2001 Nicolas Pitre
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H
+#define __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H
+
+void __init sa11x0_init_irq_nodt(int irq_start, resource_size_t io_start);
+
+#endif
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1b82d44b0a02..3d80c432ede7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -300,6 +300,11 @@ struct pmu {
 	 * Free pmu-private AUX data structures
 	 */
 	void (*free_aux)		(void *aux); /* optional */
+
+	/*
+	 * Filter events for PMU-specific reasons.
+	 */
+	int (*filter_match)		(struct perf_event *event); /* optional */
 };
 
 /**
diff --git a/include/soc/sa1100/pwer.h b/include/soc/sa1100/pwer.h
new file mode 100644
index 000000000000..15a545b5a1f6
--- /dev/null
+++ b/include/soc/sa1100/pwer.h
@@ -0,0 +1,15 @@
+#ifndef SOC_SA1100_PWER_H
+#define SOC_SA1100_PWER_H
+
+/*
+ * Copyright (C) 2015, Dmitry Eremin-Solenikov
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+int sa11x0_gpio_set_wake(unsigned int gpio, unsigned int on);
+int sa11x0_sc_set_wake(unsigned int irq, unsigned int on);
+
+#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8e13f3e54ec3..bc95b6a6220b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1502,11 +1502,17 @@ static int __init perf_workqueue_init(void)
 
 core_initcall(perf_workqueue_init);
 
+static inline int pmu_filter_match(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+	return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
 static inline int
 event_filter_match(struct perf_event *event)
 {
 	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event);
+	    && perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void