Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 13 Nov 2013 23:51:29 +0000 (08:51 +0900)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 13 Nov 2013 23:51:29 +0000 (08:51 +0900)
Pull ARM updates from Russell King:
 "Included in this series are:

   1. BE8 (modern big endian) changes for ARM from Ben Dooks
   2. big.Little support from Nicolas Pitre and Dave Martin
   3. support for LPAE systems with all system memory above 4GB
   4. Perf updates from Will Deacon
   5. Additional prefetching and other performance improvements from Will.
   6. Neon-optimised AES implementation from Ard.
   7. A number of smaller fixes scattered around the place.

  There is a rather horrid merge conflict in tools/perf - I was never
  notified of the conflict because it originally occurred between Will's
  tree and other stuff.  Consequently I have a resolution which Will
  forwarded me, which I'll forward on immediately after sending this
  mail.

  The other notable thing is I'm expecting some build breakage in the
  crypto stuff on ARM only with Ard's AES patches.  These were merged
  into a stable git branch which others had already pulled, so there's
  little I can do about this.  The problem is caused because these
  patches have a dependency on some code in the crypto git tree - I
  tried requesting a branch I can pull to resolve these, and all I got
  each time from the crypto people was "we'll revert our patches then"
  which would only make things worse since I still don't have the
  dependent patches.  I've no idea what's going on there or how to
  resolve that, and since I can't split these patches from the rest of
  this pull request, I'm rather stuck with pushing this as-is or
  reverting Ard's patches.

  Since it should "come out in the wash" I've left them in - the only
  build problems they seem to cause at the moment are with randconfigs,
  and it's a new feature anyway.  However, if by -rc1 the
  dependencies aren't in, I think it'd be best to revert Ard's patches"

I resolved the perf conflict roughly as per the patch sent by Russell,
but there may be some differences.  Any errors are likely mine.  Let's
see how the crypto issues work out..

* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (110 commits)
  ARM: 7868/1: arm/arm64: remove atomic_clear_mask() in "include/asm/atomic.h"
  ARM: 7867/1: include: asm: use 'int' instead of 'unsigned long' for 'oldval' in atomic_cmpxchg().
  ARM: 7866/1: include: asm: use 'long long' instead of 'u64' within atomic.h
  ARM: 7871/1: amba: Extend number of IRQS
  ARM: 7887/1: Don't smp_cross_call() on UP devices in arch_irq_work_raise()
  ARM: 7872/1: Support arch_irq_work_raise() via self IPIs
  ARM: 7880/1: Clear the IT state independent of the Thumb-2 mode
  ARM: 7878/1: nommu: Implement dummy early_paging_init()
  ARM: 7876/1: clear Thumb-2 IT state on exception handling
  ARM: 7874/2: bL_switcher: Remove cpu_hotplug_driver_{lock,unlock}()
  ARM: footbridge: fix build warnings for netwinder
  ARM: 7873/1: vfp: clear vfp_current_hw_state for dying cpu
  ARM: fix misplaced arch_virt_to_idmap()
  ARM: 7848/1: mcpm: Implement cpu_kill() to synchronise on powerdown
  ARM: 7847/1: mcpm: Factor out logical-to-physical CPU translation
  ARM: 7869/1: remove unused XSCALE_PMU Kconfig param
  ARM: 7864/1: Handle 64-bit memory in case of 32-bit phys_addr_t
  ARM: 7863/1: Let arm_add_memory() always use 64-bit arguments
  ARM: 7862/1: pcpu: replace __get_cpu_var_uses
  ARM: 7861/1: cacheflush: consolidate single-CPU ARMv7 cache disabling code
  ...

121 files changed:
arch/arm/Kconfig
arch/arm/Kconfig.debug
arch/arm/Makefile
arch/arm/boot/compressed/head.S
arch/arm/common/Makefile
arch/arm/common/bL_switcher.c [new file with mode: 0644]
arch/arm/common/bL_switcher_dummy_if.c [new file with mode: 0644]
arch/arm/common/mcpm_entry.c
arch/arm/common/mcpm_head.S
arch/arm/common/mcpm_platsmp.c
arch/arm/common/timer-sp.c
arch/arm/configs/h3600_defconfig
arch/arm/crypto/.gitignore [new file with mode: 0644]
arch/arm/crypto/Makefile
arch/arm/crypto/aes_glue.c
arch/arm/crypto/aes_glue.h [new file with mode: 0644]
arch/arm/crypto/aesbs-core.S_shipped [new file with mode: 0644]
arch/arm/crypto/aesbs-glue.c [new file with mode: 0644]
arch/arm/crypto/bsaes-armv7.pl [new file with mode: 0644]
arch/arm/include/asm/Kbuild
arch/arm/include/asm/assembler.h
arch/arm/include/asm/atomic.h
arch/arm/include/asm/bL_switcher.h [new file with mode: 0644]
arch/arm/include/asm/bug.h
arch/arm/include/asm/cacheflush.h
arch/arm/include/asm/cmpxchg.h
arch/arm/include/asm/cputype.h
arch/arm/include/asm/hardirq.h
arch/arm/include/asm/hardware/coresight.h
arch/arm/include/asm/kgdb.h
arch/arm/include/asm/mach/arch.h
arch/arm/include/asm/mcpm.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/mmu.h
arch/arm/include/asm/pgtable-2level.h
arch/arm/include/asm/pgtable-3level.h
arch/arm/include/asm/processor.h
arch/arm/include/asm/setup.h
arch/arm/include/asm/smp.h
arch/arm/include/asm/spinlock.h
arch/arm/include/asm/spinlock_types.h
arch/arm/include/asm/tlbflush.h
arch/arm/include/asm/unified.h
arch/arm/include/debug/efm32.S [new file with mode: 0644]
arch/arm/include/debug/msm.S
arch/arm/include/debug/pl01x.S
arch/arm/include/uapi/asm/Kbuild
arch/arm/include/uapi/asm/perf_regs.h [new file with mode: 0644]
arch/arm/kernel/Makefile
arch/arm/kernel/armksyms.c
arch/arm/kernel/entry-armv.S
arch/arm/kernel/entry-common.S
arch/arm/kernel/head.S
arch/arm/kernel/hw_breakpoint.c
arch/arm/kernel/kprobes.c
arch/arm/kernel/module.c
arch/arm/kernel/perf_event.c
arch/arm/kernel/perf_event_cpu.c
arch/arm/kernel/perf_regs.c [new file with mode: 0644]
arch/arm/kernel/setup.c
arch/arm/kernel/signal.c
arch/arm/kernel/sigreturn_codes.S [new file with mode: 0644]
arch/arm/kernel/sleep.S
arch/arm/kernel/smp.c
arch/arm/kernel/smp_scu.c
arch/arm/kernel/smp_tlb.c
arch/arm/kernel/smp_twd.c
arch/arm/kernel/suspend.c
arch/arm/kernel/traps.c
arch/arm/kvm/arm.c
arch/arm/lib/bitops.h
arch/arm/lib/uaccess_with_memcpy.c
arch/arm/mach-footbridge/netwinder-hw.c
arch/arm/mach-highbank/Kconfig
arch/arm/mach-ixp4xx/Kconfig
arch/arm/mach-mvebu/Kconfig
arch/arm/mach-mvebu/coherency_ll.S
arch/arm/mach-mvebu/headsmp.S
arch/arm/mach-sa1100/assabet.c
arch/arm/mach-sa1100/include/mach/gpio.h [deleted file]
arch/arm/mach-sa1100/include/mach/h3xxx.h
arch/arm/mach-sa1100/simpad.c
arch/arm/mach-tegra/Kconfig
arch/arm/mach-vexpress/Kconfig
arch/arm/mach-vexpress/dcscb.c
arch/arm/mach-vexpress/tc2_pm.c
arch/arm/mm/Kconfig
arch/arm/mm/abort-ev6.S
arch/arm/mm/alignment.c
arch/arm/mm/dma-mapping.c
arch/arm/mm/extable.c
arch/arm/mm/idmap.c
arch/arm/mm/mmap.c
arch/arm/mm/mmu.c
arch/arm/mm/nommu.c
arch/arm/mm/proc-v6.S
arch/arm/mm/proc-v7.S
arch/arm/net/bpf_jit_32.c
arch/arm/plat-versatile/headsmp.S
arch/arm/vfp/vfpmodule.c
arch/arm64/include/asm/atomic.h
arch/arm64/kernel/debug-monitors.c
arch/arm64/kernel/hw_breakpoint.c
arch/arm64/kernel/perf_event.c
crypto/Kconfig
drivers/bus/arm-cci.c
drivers/gpio/gpio-sa1100.c
drivers/irqchip/irq-gic.c
drivers/mmc/host/mmci.c
drivers/mmc/host/mmci.h
include/linux/amba/bus.h
include/linux/irqchip/arm-gic.h
include/trace/events/power_cpu_migrate.h [new file with mode: 0644]
tools/perf/arch/arm/Makefile
tools/perf/arch/arm/include/perf_regs.h [new file with mode: 0644]
tools/perf/arch/arm/util/unwind.c [new file with mode: 0644]
tools/perf/config/Makefile
tools/perf/config/feature-checks/Makefile
tools/perf/config/feature-checks/test-all.c
tools/perf/config/feature-checks/test-libunwind-debug-frame.c [new file with mode: 0644]
tools/perf/util/unwind.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index acb80708accdb5d9af59aedb89c9a2bfb308e990..603d661b445d4a2690c7bcb5dbd9cb6d282cc0e6 100644 (file)
@@ -5,6 +5,7 @@ config ARM
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAVE_CUSTOM_GPIO_H
+       select ARCH_USE_CMPXCHG_LOCKREF
        select ARCH_WANT_IPC_PARSE_VERSION
        select BUILDTIME_EXTABLE_SORT if MMU
        select CLONE_BACKWARDS
@@ -51,6 +52,8 @@ config ARM
        select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
        select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
        select HAVE_PERF_EVENTS
+       select HAVE_PERF_REGS
+       select HAVE_PERF_USER_STACK_DUMP
        select HAVE_REGS_AND_STACK_ACCESS_API
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_UID16
@@ -481,6 +484,7 @@ config ARCH_IXP4XX
        bool "IXP4xx-based"
        depends on MMU
        select ARCH_HAS_DMA_SET_COHERENT_MASK
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select ARCH_REQUIRE_GPIOLIB
        select CLKSRC_MMIO
        select CPU_XSCALE
@@ -688,7 +692,6 @@ config ARCH_SA1100
        select GENERIC_CLOCKEVENTS
        select HAVE_IDE
        select ISA
-       select NEED_MACH_GPIO_H
        select NEED_MACH_MEMORY_H
        select SPARSE_IRQ
        help
@@ -1064,11 +1067,6 @@ config IWMMXT
          Enable support for iWMMXt context switching at run time if
          running on a CPU that supports it.
 
-config XSCALE_PMU
-       bool
-       depends on CPU_XSCALE
-       default y
-
 config MULTI_IRQ_HANDLER
        bool
        help
@@ -1516,6 +1514,32 @@ config MCPM
          for (multi-)cluster based systems, such as big.LITTLE based
          systems.
 
+config BIG_LITTLE
+       bool "big.LITTLE support (Experimental)"
+       depends on CPU_V7 && SMP
+       select MCPM
+       help
+         This option enables support selections for the big.LITTLE
+         system architecture.
+
+config BL_SWITCHER
+       bool "big.LITTLE switcher support"
+       depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+       select CPU_PM
+       select ARM_CPU_SUSPEND
+       help
+         The big.LITTLE "switcher" provides the core functionality to
+         transparently handle transition between a cluster of A15's
+         and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+       tristate "Simple big.LITTLE switcher user interface"
+       depends on BL_SWITCHER && DEBUG_KERNEL
+       help
+         This is a simple and dummy char dev interface to control
+         the big.LITTLE switcher core code.  It is meant for
+         debugging purposes only.
+
 choice
        prompt "Memory split"
        default VMSPLIT_3G
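
The new Kconfig entries above form a small hierarchy: BIG_LITTLE selects MCPM, BL_SWITCHER depends on BIG_LITTLE plus HOTPLUG_CPU and selects CPU_PM and ARM_CPU_SUSPEND, and the dummy interface is a tristate gated on DEBUG_KERNEL.  As a rough illustration only (not part of the commit), a .config fragment exercising all three might look like:

# Illustrative fragment; CONFIG_HOTPLUG_CPU and CONFIG_DEBUG_KERNEL are
# assumed to be enabled already, MCPM/CPU_PM/ARM_CPU_SUSPEND come in
# via "select".
CONFIG_BIG_LITTLE=y
CONFIG_BL_SWITCHER=y
CONFIG_BL_SWITCHER_DUMMY_IF=m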
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index d597c6b8488ba257fa37319d9afc7dc6362933a7..5765abf5ce84576d8de31df83d709160905d7b19 100644 (file)
@@ -318,6 +318,7 @@ choice
        config DEBUG_MSM_UART1
                bool "Kernel low-level debugging messages via MSM UART1"
                depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the first serial port on MSM devices.
@@ -325,6 +326,7 @@ choice
        config DEBUG_MSM_UART2
                bool "Kernel low-level debugging messages via MSM UART2"
                depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the second serial port on MSM devices.
@@ -332,6 +334,7 @@ choice
        config DEBUG_MSM_UART3
                bool "Kernel low-level debugging messages via MSM UART3"
                depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the third serial port on MSM devices.
@@ -340,6 +343,7 @@ choice
                bool "Kernel low-level debugging messages via MSM 8660 UART"
                depends on ARCH_MSM8X60
                select MSM_HAS_DEBUG_UART_HS
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the serial port on MSM 8660 devices.
@@ -348,10 +352,20 @@ choice
                bool "Kernel low-level debugging messages via MSM 8960 UART"
                depends on ARCH_MSM8960
                select MSM_HAS_DEBUG_UART_HS
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the serial port on MSM 8960 devices.
 
+       config DEBUG_MSM8974_UART
+               bool "Kernel low-level debugging messages via MSM 8974 UART"
+               depends on ARCH_MSM8974
+               select MSM_HAS_DEBUG_UART_HS
+               select DEBUG_MSM_UART
+               help
+                 Say Y here if you want the debug print routines to direct
+                 their output to the serial port on MSM 8974 devices.
+
        config DEBUG_MVEBU_UART
                bool "Kernel low-level debugging messages via MVEBU UART (old bootloaders)"
                depends on ARCH_MVEBU
@@ -841,6 +855,20 @@ choice
                  options; the platform specific options are deprecated
                  and will be soon removed.
 
+       config DEBUG_LL_UART_EFM32
+               bool "Kernel low-level debugging via efm32 UART"
+               depends on ARCH_EFM32
+               help
+                 Say Y here if you want the debug print routines to direct
+                 their output to an UART or USART port on efm32 based
+                 machines. Use the following addresses for DEBUG_UART_PHYS:
+
+                   0x4000c000 | USART0
+                   0x4000c400 | USART1
+                   0x4000c800 | USART2
+                   0x4000e000 | UART0
+                   0x4000e400 | UART1
+
        config DEBUG_LL_UART_PL01X
                bool "Kernel low-level debugging via ARM Ltd PL01x Primecell UART"
                help
@@ -887,11 +915,16 @@ config DEBUG_STI_UART
        bool
        depends on ARCH_STI
 
+config DEBUG_MSM_UART
+       bool
+       depends on ARCH_MSM
+
 config DEBUG_LL_INCLUDE
        string
        default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250
        default "debug/pl01x.S" if DEBUG_LL_UART_PL01X || DEBUG_UART_PL01X
        default "debug/exynos.S" if DEBUG_EXYNOS_UART
+       default "debug/efm32.S" if DEBUG_LL_UART_EFM32
        default "debug/icedcc.S" if DEBUG_ICEDCC
        default "debug/imx.S" if DEBUG_IMX1_UART || \
                                 DEBUG_IMX25_UART || \
@@ -902,11 +935,7 @@ config DEBUG_LL_INCLUDE
                                 DEBUG_IMX53_UART ||\
                                 DEBUG_IMX6Q_UART || \
                                 DEBUG_IMX6SL_UART
-       default "debug/msm.S" if DEBUG_MSM_UART1 || \
-                                DEBUG_MSM_UART2 || \
-                                DEBUG_MSM_UART3 || \
-                                DEBUG_MSM8660_UART || \
-                                DEBUG_MSM8960_UART
+       default "debug/msm.S" if DEBUG_MSM_UART
        default "debug/omap2plus.S" if DEBUG_OMAP2PLUS_UART
        default "debug/sirf.S" if DEBUG_SIRFPRIMA2_UART1 || DEBUG_SIRFMARCO_UART1
        default "debug/sti.S" if DEBUG_STI_UART
@@ -959,6 +988,7 @@ config DEBUG_UART_PHYS
        default 0x20064000 if DEBUG_RK29_UART1 || DEBUG_RK3X_UART2
        default 0x20068000 if DEBUG_RK29_UART2 || DEBUG_RK3X_UART3
        default 0x20201000 if DEBUG_BCM2835
+       default 0x4000e400 if DEBUG_LL_UART_EFM32
        default 0x40090000 if ARCH_LPC32XX
        default 0x40100000 if DEBUG_PXA_UART1
        default 0x42000000 if ARCH_GEMINI
@@ -989,6 +1019,7 @@ config DEBUG_UART_PHYS
        default 0xfff36000 if DEBUG_HIGHBANK_UART
        default 0xfffff700 if ARCH_IOP33X
        depends on DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \
+               DEBUG_LL_UART_EFM32 || \
                DEBUG_UART_8250 || DEBUG_UART_PL01X
 
 config DEBUG_UART_VIRT
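
A side note on the DEBUG_LL_UART_EFM32 help text earlier in this hunk: on an efm32 machine (ARCH_EFM32 assumed), a hypothetical low-level debug configuration would be little more than the fragment below.  This is an illustration only, not taken from the commit.

CONFIG_DEBUG_LL=y
CONFIG_DEBUG_LL_UART_EFM32=y
# DEBUG_UART_PHYS defaults to 0x4000e400 (UART1); the other USART/UART
# base addresses listed in the help text can be substituted by hand.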
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 8b667132d7b419bb572549d3f5149c7f9e503860..c99b1086d83dfa8c0c407bab392ff5bb12927ab5 100644 (file)
@@ -16,6 +16,7 @@ LDFLAGS               :=
 LDFLAGS_vmlinux        :=-p --no-undefined -X
 ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
 LDFLAGS_vmlinux        += --be8
+LDFLAGS_MODULE += --be8
 endif
 
 OBJCOPYFLAGS   :=-O binary -R .comment -S
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 75189f13cf54c63d0db56b98ad2c41834a30893e..066b03480b63caacc04abf1ef0ec5ccf037fa52d 100644 (file)
@@ -135,6 +135,7 @@ start:
                .word   _edata                  @ zImage end address
  THUMB(                .thumb                  )
 1:
+ ARM_BE8(      setend  be )                    @ go BE8 if compiled for BE8
                mrs     r9, cpsr
 #ifdef CONFIG_ARM_VIRT_EXT
                bl      __hyp_stub_install      @ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-               orr     r0, r0, #1 << 25        @ big-endian page tables
-#endif
+ ARM_BE8(      orr     r0, r0, #1 << 25 )      @ big-endian page tables
                bl      __common_mmu_cache_on
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
                orr     r0, r0, #1 << 22        @ U (v6 unaligned access model)
                                                @ (needed for ARM1176)
 #ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-               orr     r0, r0, #1 << 25        @ big-endian page tables
-#endif
+ ARM_BE8(      orr     r0, r0, #1 << 25 )      @ big-endian page tables
                mrcne   p15, 0, r6, c2, c0, 2   @ read ttb control reg
                orrne   r0, r0, #1              @ MMU enabled
                movne   r1, #0xfffffffd         @ domain 0 = client
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index eaa9cf4705a7179dcb7221aa5f2a29a6c1883b78..4bdc41622c36686df868e2134d6453fe60684b5b 100644 (file)
@@ -16,3 +16,5 @@ obj-$(CONFIG_MCPM)            += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 AFLAGS_mcpm_head.o             := -march=armv7-a
 AFLAGS_vlock.o                 := -march=armv7-a
 obj-$(CONFIG_TI_PRIV_EDMA)     += edma.o
+obj-$(CONFIG_BL_SWITCHER)      += bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
new file mode 100644 (file)
index 0000000..5774b6e
--- /dev/null
@@ -0,0 +1,822 @@
+/*
+ * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
+ *
+ * Created by: Nicolas Pitre, March 2012
+ * Copyright:  (C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp_plat.h>
+#include <asm/cputype.h>
+#include <asm/suspend.h>
+#include <asm/mcpm.h>
+#include <asm/bL_switcher.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power_cpu_migrate.h>
+
+
+/*
+ * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
+ * __attribute_const__ and we don't want the compiler to assume any
+ * constness here as the value _does_ change along some code paths.
+ */
+
+static int read_mpidr(void)
+{
+       unsigned int id;
+       asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
+       return id & MPIDR_HWID_BITMASK;
+}
+
+/*
+ * Get a global nanosecond time stamp for tracing.
+ */
+static s64 get_ns(void)
+{
+       struct timespec ts;
+       getnstimeofday(&ts);
+       return timespec_to_ns(&ts);
+}
+
+/*
+ * bL switcher core code.
+ */
+
+static void bL_do_switch(void *_arg)
+{
+       unsigned ib_mpidr, ib_cpu, ib_cluster;
+       long volatile handshake, **handshake_ptr = _arg;
+
+       pr_debug("%s\n", __func__);
+
+       ib_mpidr = cpu_logical_map(smp_processor_id());
+       ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+       ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+       /* Advertise our handshake location */
+       if (handshake_ptr) {
+               handshake = 0;
+               *handshake_ptr = &handshake;
+       } else
+               handshake = -1;
+
+       /*
+        * Our state has been saved at this point.  Let's release our
+        * inbound CPU.
+        */
+       mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
+       sev();
+
+       /*
+        * From this point, we must assume that our counterpart CPU might
+        * have taken over in its parallel world already, as if execution
+        * just returned from cpu_suspend().  It is therefore important to
+        * be very careful not to make any change the other guy is not
+        * expecting.  This is why we need stack isolation.
+        *
+        * Fancy under cover tasks could be performed here.  For now
+        * we have none.
+        */
+
+       /*
+        * Let's wait until our inbound is alive.
+        */
+       while (!handshake) {
+               wfe();
+               smp_mb();
+       }
+
+       /* Let's put ourself down. */
+       mcpm_cpu_power_down();
+
+       /* should never get here */
+       BUG();
+}
+
+/*
+ * Stack isolation.  To ensure 'current' remains valid, we just use another
+ * piece of our thread's stack space which should be fairly lightly used.
+ * The selected area starts just above the thread_info structure located
+ * at the very bottom of the stack, aligned to a cache line, and indexed
+ * with the cluster number.
+ */
+#define STACK_SIZE 512
+extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
+static int bL_switchpoint(unsigned long _arg)
+{
+       unsigned int mpidr = read_mpidr();
+       unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+       void *stack = current_thread_info() + 1;
+       stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
+       stack += clusterid * STACK_SIZE + STACK_SIZE;
+       call_with_stack(bL_do_switch, (void *)_arg, stack);
+       BUG();
+}
+
+/*
+ * Generic switcher interface
+ */
+
+static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
+static int bL_switcher_cpu_pairing[NR_CPUS];
+
+/*
+ * bL_switch_to - Switch to a specific cluster for the current CPU
+ * @new_cluster_id: the ID of the cluster to switch to.
+ *
+ * This function must be called on the CPU to be switched.
+ * Returns 0 on success, else a negative status code.
+ */
+static int bL_switch_to(unsigned int new_cluster_id)
+{
+       unsigned int mpidr, this_cpu, that_cpu;
+       unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
+       struct completion inbound_alive;
+       struct tick_device *tdev;
+       enum clock_event_mode tdev_mode;
+       long volatile *handshake_ptr;
+       int ipi_nr, ret;
+
+       this_cpu = smp_processor_id();
+       ob_mpidr = read_mpidr();
+       ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
+       ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
+       BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
+
+       if (new_cluster_id == ob_cluster)
+               return 0;
+
+       that_cpu = bL_switcher_cpu_pairing[this_cpu];
+       ib_mpidr = cpu_logical_map(that_cpu);
+       ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+       ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+       pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
+                this_cpu, ob_mpidr, ib_mpidr);
+
+       this_cpu = smp_processor_id();
+
+       /* Close the gate for our entry vectors */
+       mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
+       mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
+
+       /* Install our "inbound alive" notifier. */
+       init_completion(&inbound_alive);
+       ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
+       ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
+       mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
+
+       /*
+        * Let's wake up the inbound CPU now in case it requires some delay
+        * to come online, but leave it gated in our entry vector code.
+        */
+       ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
+       if (ret) {
+               pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
+               return ret;
+       }
+
+       /*
+        * Raise a SGI on the inbound CPU to make sure it doesn't stall
+        * in a possible WFI, such as in bL_power_down().
+        */
+       gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
+
+       /*
+        * Wait for the inbound to come up.  This allows for other
+        * tasks to be scheduled in the mean time.
+        */
+       wait_for_completion(&inbound_alive);
+       mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
+
+       /*
+        * From this point we are entering the switch critical zone
+        * and can't take any interrupts anymore.
+        */
+       local_irq_disable();
+       local_fiq_disable();
+       trace_cpu_migrate_begin(get_ns(), ob_mpidr);
+
+       /* redirect GIC's SGIs to our counterpart */
+       gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
+
+       tdev = tick_get_device(this_cpu);
+       if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
+               tdev = NULL;
+       if (tdev) {
+               tdev_mode = tdev->evtdev->mode;
+               clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+       }
+
+       ret = cpu_pm_enter();
+
+       /* we can not tolerate errors at this point */
+       if (ret)
+               panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
+
+       /* Swap the physical CPUs in the logical map for this logical CPU. */
+       cpu_logical_map(this_cpu) = ib_mpidr;
+       cpu_logical_map(that_cpu) = ob_mpidr;
+
+       /* Let's do the actual CPU switch. */
+       ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
+       if (ret > 0)
+               panic("%s: cpu_suspend() returned %d\n", __func__, ret);
+
+       /* We are executing on the inbound CPU at this point */
+       mpidr = read_mpidr();
+       pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
+       BUG_ON(mpidr != ib_mpidr);
+
+       mcpm_cpu_powered_up();
+
+       ret = cpu_pm_exit();
+
+       if (tdev) {
+               clockevents_set_mode(tdev->evtdev, tdev_mode);
+               clockevents_program_event(tdev->evtdev,
+                                         tdev->evtdev->next_event, 1);
+       }
+
+       trace_cpu_migrate_finish(get_ns(), ib_mpidr);
+       local_fiq_enable();
+       local_irq_enable();
+
+       *handshake_ptr = 1;
+       dsb_sev();
+
+       if (ret)
+               pr_err("%s exiting with error %d\n", __func__, ret);
+       return ret;
+}
+
+struct bL_thread {
+       spinlock_t lock;
+       struct task_struct *task;
+       wait_queue_head_t wq;
+       int wanted_cluster;
+       struct completion started;
+       bL_switch_completion_handler completer;
+       void *completer_cookie;
+};
+
+static struct bL_thread bL_threads[NR_CPUS];
+
+static int bL_switcher_thread(void *arg)
+{
+       struct bL_thread *t = arg;
+       struct sched_param param = { .sched_priority = 1 };
+       int cluster;
+       bL_switch_completion_handler completer;
+       void *completer_cookie;
+
+       sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+       complete(&t->started);
+
+       do {
+               if (signal_pending(current))
+                       flush_signals(current);
+               wait_event_interruptible(t->wq,
+                               t->wanted_cluster != -1 ||
+                               kthread_should_stop());
+
+               spin_lock(&t->lock);
+               cluster = t->wanted_cluster;
+               completer = t->completer;
+               completer_cookie = t->completer_cookie;
+               t->wanted_cluster = -1;
+               t->completer = NULL;
+               spin_unlock(&t->lock);
+
+               if (cluster != -1) {
+                       bL_switch_to(cluster);
+
+                       if (completer)
+                               completer(completer_cookie);
+               }
+       } while (!kthread_should_stop());
+
+       return 0;
+}
+
+static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
+{
+       struct task_struct *task;
+
+       task = kthread_create_on_node(bL_switcher_thread, arg,
+                                     cpu_to_node(cpu), "kswitcher_%d", cpu);
+       if (!IS_ERR(task)) {
+               kthread_bind(task, cpu);
+               wake_up_process(task);
+       } else
+               pr_err("%s failed for CPU %d\n", __func__, cpu);
+       return task;
+}
+
+/*
+ * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
+ *      with completion notification via a callback
+ *
+ * @cpu: the CPU to switch
+ * @new_cluster_id: the ID of the cluster to switch to.
+ * @completer: switch completion callback.  if non-NULL,
+ *     @completer(@completer_cookie) will be called on completion of
+ *     the switch, in non-atomic context.
+ * @completer_cookie: opaque context argument for @completer.
+ *
+ * This function causes a cluster switch on the given CPU by waking up
+ * the appropriate switcher thread.  This function may or may not return
+ * before the switch has occurred.
+ *
+ * If a @completer callback function is supplied, it will be called when
+ * the switch is complete.  This can be used to determine asynchronously
+ * when the switch is complete, regardless of when bL_switch_request()
+ * returns.  When @completer is supplied, no new switch request is permitted
+ * for the affected CPU until after the switch is complete, and @completer
+ * has returned.
+ */
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+                        bL_switch_completion_handler completer,
+                        void *completer_cookie)
+{
+       struct bL_thread *t;
+
+       if (cpu >= ARRAY_SIZE(bL_threads)) {
+               pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+               return -EINVAL;
+       }
+
+       t = &bL_threads[cpu];
+
+       if (IS_ERR(t->task))
+               return PTR_ERR(t->task);
+       if (!t->task)
+               return -ESRCH;
+
+       spin_lock(&t->lock);
+       if (t->completer) {
+               spin_unlock(&t->lock);
+               return -EBUSY;
+       }
+       t->completer = completer;
+       t->completer_cookie = completer_cookie;
+       t->wanted_cluster = new_cluster_id;
+       spin_unlock(&t->lock);
+       wake_up(&t->wq);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(bL_switch_request_cb);
+
+/*
+ * Activation and configuration code.
+ */
+
+static DEFINE_MUTEX(bL_switcher_activation_lock);
+static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
+static unsigned int bL_switcher_active;
+static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
+static cpumask_t bL_switcher_removed_logical_cpus;
+
+int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_register(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
+
+int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
+
+static int bL_activation_notify(unsigned long val)
+{
+       int ret;
+
+       ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
+       if (ret & NOTIFY_STOP_MASK)
+               pr_err("%s: notifier chain failed with status 0x%x\n",
+                       __func__, ret);
+       return notifier_to_errno(ret);
+}
+
+static void bL_switcher_restore_cpus(void)
+{
+       int i;
+
+       for_each_cpu(i, &bL_switcher_removed_logical_cpus)
+               cpu_up(i);
+}
+
+static int bL_switcher_halve_cpus(void)
+{
+       int i, j, cluster_0, gic_id, ret;
+       unsigned int cpu, cluster, mask;
+       cpumask_t available_cpus;
+
+       /* First pass to validate what we have */
+       mask = 0;
+       for_each_online_cpu(i) {
+               cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+               if (cluster >= 2) {
+                       pr_err("%s: only dual cluster systems are supported\n", __func__);
+                       return -EINVAL;
+               }
+               if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
+                       return -EINVAL;
+               mask |= (1 << cluster);
+       }
+       if (mask != 3) {
+               pr_err("%s: no CPU pairing possible\n", __func__);
+               return -EINVAL;
+       }
+
+       /*
+        * Now let's do the pairing.  We match each CPU with another CPU
+        * from a different cluster.  To get a uniform scheduling behavior
+        * without fiddling with CPU topology and compute capacity data,
+        * we'll use logical CPUs initially belonging to the same cluster.
+        */
+       memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
+       cpumask_copy(&available_cpus, cpu_online_mask);
+       cluster_0 = -1;
+       for_each_cpu(i, &available_cpus) {
+               int match = -1;
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+               if (cluster_0 == -1)
+                       cluster_0 = cluster;
+               if (cluster != cluster_0)
+                       continue;
+               cpumask_clear_cpu(i, &available_cpus);
+               for_each_cpu(j, &available_cpus) {
+                       cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
+                       /*
+                        * Let's remember the last match to create "odd"
+                        * pairings on purpose in order for other code not
+                        * to assume any relation between physical and
+                        * logical CPU numbers.
+                        */
+                       if (cluster != cluster_0)
+                               match = j;
+               }
+               if (match != -1) {
+                       bL_switcher_cpu_pairing[i] = match;
+                       cpumask_clear_cpu(match, &available_cpus);
+                       pr_info("CPU%d paired with CPU%d\n", i, match);
+               }
+       }
+
+       /*
+        * Now we disable the unwanted CPUs i.e. everything that has no
+        * pairing information (that includes the pairing counterparts).
+        */
+       cpumask_clear(&bL_switcher_removed_logical_cpus);
+       for_each_online_cpu(i) {
+               cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+
+               /* Let's take note of the GIC ID for this CPU */
+               gic_id = gic_get_cpu_id(i);
+               if (gic_id < 0) {
+                       pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
+                       bL_switcher_restore_cpus();
+                       return -EINVAL;
+               }
+               bL_gic_id[cpu][cluster] = gic_id;
+               pr_info("GIC ID for CPU %u cluster %u is %u\n",
+                       cpu, cluster, gic_id);
+
+               if (bL_switcher_cpu_pairing[i] != -1) {
+                       bL_switcher_cpu_original_cluster[i] = cluster;
+                       continue;
+               }
+
+               ret = cpu_down(i);
+               if (ret) {
+                       bL_switcher_restore_cpus();
+                       return ret;
+               }
+               cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
+       }
+
+       return 0;
+}
+
+/* Determine the logical CPU a given physical CPU is grouped on. */
+int bL_switcher_get_logical_index(u32 mpidr)
+{
+       int cpu;
+
+       if (!bL_switcher_active)
+               return -EUNATCH;
+
+       mpidr &= MPIDR_HWID_BITMASK;
+       for_each_online_cpu(cpu) {
+               int pairing = bL_switcher_cpu_pairing[cpu];
+               if (pairing == -1)
+                       continue;
+               if ((mpidr == cpu_logical_map(cpu)) ||
+                   (mpidr == cpu_logical_map(pairing)))
+                       return cpu;
+       }
+       return -EINVAL;
+}
+
+static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
+{
+       trace_cpu_migrate_current(get_ns(), read_mpidr());
+}
+
+int bL_switcher_trace_trigger(void)
+{
+       int ret;
+
+       preempt_disable();
+
+       bL_switcher_trace_trigger_cpu(NULL);
+       ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+
+       preempt_enable();
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
+
+static int bL_switcher_enable(void)
+{
+       int cpu, ret;
+
+       mutex_lock(&bL_switcher_activation_lock);
+       lock_device_hotplug();
+       if (bL_switcher_active) {
+               unlock_device_hotplug();
+               mutex_unlock(&bL_switcher_activation_lock);
+               return 0;
+       }
+
+       pr_info("big.LITTLE switcher initializing\n");
+
+       ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
+       if (ret)
+               goto error;
+
+       ret = bL_switcher_halve_cpus();
+       if (ret)
+               goto error;
+
+       bL_switcher_trace_trigger();
+
+       for_each_online_cpu(cpu) {
+               struct bL_thread *t = &bL_threads[cpu];
+               spin_lock_init(&t->lock);
+               init_waitqueue_head(&t->wq);
+               init_completion(&t->started);
+               t->wanted_cluster = -1;
+               t->task = bL_switcher_thread_create(cpu, t);
+       }
+
+       bL_switcher_active = 1;
+       bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+       pr_info("big.LITTLE switcher initialized\n");
+       goto out;
+
+error:
+       pr_warn("big.LITTLE switcher initialization failed\n");
+       bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+       unlock_device_hotplug();
+       mutex_unlock(&bL_switcher_activation_lock);
+       return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+static void bL_switcher_disable(void)
+{
+       unsigned int cpu, cluster;
+       struct bL_thread *t;
+       struct task_struct *task;
+
+       mutex_lock(&bL_switcher_activation_lock);
+       lock_device_hotplug();
+
+       if (!bL_switcher_active)
+               goto out;
+
+       if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
+               bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+               goto out;
+       }
+
+       bL_switcher_active = 0;
+
+       /*
+        * To deactivate the switcher, we must shut down the switcher
+        * threads to prevent any other requests from being accepted.
+        * Then, if the final cluster for given logical CPU is not the
+        * same as the original one, we'll recreate a switcher thread
+        * just for the purpose of switching the CPU back without any
+        * possibility for interference from external requests.
+        */
+       for_each_online_cpu(cpu) {
+               t = &bL_threads[cpu];
+               task = t->task;
+               t->task = NULL;
+               if (!task || IS_ERR(task))
+                       continue;
+               kthread_stop(task);
+               /* no more switch may happen on this CPU at this point */
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+               if (cluster == bL_switcher_cpu_original_cluster[cpu])
+                       continue;
+               init_completion(&t->started);
+               t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
+               task = bL_switcher_thread_create(cpu, t);
+               if (!IS_ERR(task)) {
+                       wait_for_completion(&t->started);
+                       kthread_stop(task);
+                       cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+                       if (cluster == bL_switcher_cpu_original_cluster[cpu])
+                               continue;
+               }
+               /* If execution gets here, we're in trouble. */
+               pr_crit("%s: unable to restore original cluster for CPU %d\n",
+                       __func__, cpu);
+               pr_crit("%s: CPU %d can't be restored\n",
+                       __func__, bL_switcher_cpu_pairing[cpu]);
+               cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
+                                 &bL_switcher_removed_logical_cpus);
+       }
+
+       bL_switcher_restore_cpus();
+       bL_switcher_trace_trigger();
+
+       bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+       unlock_device_hotplug();
+       mutex_unlock(&bL_switcher_activation_lock);
+}
+
+static ssize_t bL_switcher_active_show(struct kobject *kobj,
+               struct kobj_attribute *attr, char *buf)
+{
+       return sprintf(buf, "%u\n", bL_switcher_active);
+}
+
+static ssize_t bL_switcher_active_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t count)
+{
+       int ret;
+
+       switch (buf[0]) {
+       case '0':
+               bL_switcher_disable();
+               ret = 0;
+               break;
+       case '1':
+               ret = bL_switcher_enable();
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return (ret >= 0) ? count : ret;
+}
+
+static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t count)
+{
+       int ret = bL_switcher_trace_trigger();
+
+       return ret ? ret : count;
+}
+
+static struct kobj_attribute bL_switcher_active_attr =
+       __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
+
+static struct kobj_attribute bL_switcher_trace_trigger_attr =
+       __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
+
+static struct attribute *bL_switcher_attrs[] = {
+       &bL_switcher_active_attr.attr,
+       &bL_switcher_trace_trigger_attr.attr,
+       NULL,
+};
+
+static struct attribute_group bL_switcher_attr_group = {
+       .attrs = bL_switcher_attrs,
+};
+
+static struct kobject *bL_switcher_kobj;
+
+static int __init bL_switcher_sysfs_init(void)
+{
+       int ret;
+
+       bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
+       if (!bL_switcher_kobj)
+               return -ENOMEM;
+       ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
+       if (ret)
+               kobject_put(bL_switcher_kobj);
+       return ret;
+}
+
+#endif  /* CONFIG_SYSFS */
+
+bool bL_switcher_get_enabled(void)
+{
+       mutex_lock(&bL_switcher_activation_lock);
+
+       return bL_switcher_active;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
+
+void bL_switcher_put_enabled(void)
+{
+       mutex_unlock(&bL_switcher_activation_lock);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
+
+/*
+ * Veto any CPU hotplug operation on those CPUs we've removed
+ * while the switcher is active.
+ * We're just not ready to deal with that given the trickery involved.
+ */
+static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
+                                       unsigned long action, void *hcpu)
+{
+       if (bL_switcher_active) {
+               int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
+               switch (action & 0xf) {
+               case CPU_UP_PREPARE:
+               case CPU_DOWN_PREPARE:
+                       if (pairing == -1)
+                               return NOTIFY_BAD;
+               }
+       }
+       return NOTIFY_DONE;
+}
+
+static bool no_bL_switcher;
+core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
+
+static int __init bL_switcher_init(void)
+{
+       int ret;
+
+       if (MAX_NR_CLUSTERS != 2) {
+               pr_err("%s: only dual cluster systems are supported\n", __func__);
+               return -EINVAL;
+       }
+
+       cpu_notifier(bL_switcher_hotplug_callback, 0);
+
+       if (!no_bL_switcher) {
+               ret = bL_switcher_enable();
+               if (ret)
+                       return ret;
+       }
+
+#ifdef CONFIG_SYSFS
+       ret = bL_switcher_sysfs_init();
+       if (ret)
+               pr_err("%s: unable to create sysfs entry\n", __func__);
+#endif
+
+       return 0;
+}
+
+late_initcall(bL_switcher_init);
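
To make the exported interface above concrete, here is a minimal, hypothetical in-kernel user (illustration only, not part of this commit) that asks for logical CPU 0 to be moved to cluster 1 and blocks until the completer fires.  It assumes <asm/bL_switcher.h> declares bL_switch_request_cb() and bL_switch_completion_handler as they are used in bL_switcher.c above.  From user space, the switcher itself is enabled or disabled by writing '1' or '0' to the "active" sysfs attribute registered by bL_switcher_sysfs_init() (under the kernel kobject, i.e. /sys/kernel/bL_switcher/active), and writing to "trace_trigger" re-emits the current-MPIDR trace events.

/*
 * Hypothetical caller, for illustration only: request a switch of
 * logical CPU 0 to cluster 1 and wait for its completion callback.
 */
#include <linux/completion.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/bL_switcher.h>

static DECLARE_COMPLETION(switch_done);

static void switch_demo_completer(void *cookie)
{
	/* Runs in non-atomic context once the switch has completed. */
	complete(cookie);
}

static int __init switch_demo_init(void)
{
	int ret = bL_switch_request_cb(0, 1, switch_demo_completer,
				       &switch_done);

	if (ret)	/* -EINVAL, -ESRCH or -EBUSY, as returned above */
		return ret;
	wait_for_completion(&switch_done);
	return 0;
}
module_init(switch_demo_init);
MODULE_LICENSE("GPL");

Note that bL_switch_request_cb() refuses a second request for the same CPU until the previous completer has run (-EBUSY), which is why the demo waits before returning.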
diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c
new file mode 100644 (file)
index 0000000..3f47f12
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
+ *
+ * Created by: Nicolas Pitre, November 2012
+ * Copyright:  (C) 2012-2013  Linaro Limited
+ *
+ * Dummy interface to user space for debugging purpose only.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <asm/bL_switcher.h>
+
+static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
+                       size_t len, loff_t *pos)
+{
+       unsigned char val[3];
+       unsigned int cpu, cluster;
+       int ret;
+
+       pr_debug("%s\n", __func__);
+
+       if (len < 3)
+               return -EINVAL;
+
+       if (copy_from_user(val, buf, 3))
+               return -EFAULT;
+
+       /* format: <cpu#>,<cluster#> */
+       if (val[0] < '0' || val[0] > '9' ||
+           val[1] != ',' ||
+           val[2] < '0' || val[2] > '1')
+               return -EINVAL;
+
+       cpu = val[0] - '0';
+       cluster = val[2] - '0';
+       ret = bL_switch_request(cpu, cluster);
+
+       return ret ? : len;
+}
+
+static const struct file_operations bL_switcher_fops = {
+       .write          = bL_switcher_write,
+       .owner  = THIS_MODULE,
+};
+
+static struct miscdevice bL_switcher_device = {
+       MISC_DYNAMIC_MINOR,
+       "b.L_switcher",
+       &bL_switcher_fops
+};
+
+static int __init bL_switcher_dummy_if_init(void)
+{
+       return misc_register(&bL_switcher_device);
+}
+
+static void __exit bL_switcher_dummy_if_exit(void)
+{
+       misc_deregister(&bL_switcher_device);
+}
+
+module_init(bL_switcher_dummy_if_init);
+module_exit(bL_switcher_dummy_if_exit);
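
The dummy interface parses a three-character "<cpu#>,<cluster#>" string, so exercising it from user space is trivial.  A sketch follows (illustration only; the /dev path is an assumption, since the driver registers a dynamic misc minor named "b.L_switcher" and the actual node name depends on udev):

/* Illustrative user-space counterpart, not part of this commit. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/b.L_switcher", O_WRONLY);	/* assumed path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "0,1" requests that logical CPU 0 run on cluster 1. */
	if (write(fd, "0,1", 3) != 3)
		perror("write");
	close(fd);
	return 0;
}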
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 990250965f2cfb4e4e3a984678fcf62eedbcdb8d..26020a03f659f2d78fe37f2dca7f3e0a0eb573ca 100644 (file)
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
        sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
 
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+                        unsigned long poke_phys_addr, unsigned long poke_val)
+{
+       unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+       poke[0] = poke_phys_addr;
+       poke[1] = poke_val;
+       __cpuc_flush_dcache_area((void *)poke, 8);
+       outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
 static const struct mcpm_platform_ops *platform_ops;
 
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
@@ -90,6 +102,21 @@ void mcpm_cpu_power_down(void)
        BUG();
 }
 
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster)
+{
+       int ret;
+
+       if (WARN_ON_ONCE(!platform_ops || !platform_ops->power_down_finish))
+               return -EUNATCH;
+
+       ret = platform_ops->power_down_finish(cpu, cluster);
+       if (ret)
+               pr_warn("%s: cpu %u, cluster %u failed to power down (%d)\n",
+                       __func__, cpu, cluster, ret);
+
+       return ret;
+}
+
 void mcpm_cpu_suspend(u64 expected_residency)
 {
        phys_reset_t phys_reset;
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 39c96df3477a41549d71e1a18733dd85f4a168df..e02db4b81a66942d307cced79e0cd6c6ae9733b5 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
+#include <asm/assembler.h>
 
 #include "vlock.h"
 
@@ -47,6 +48,7 @@
 
 ENTRY(mcpm_entry_point)
 
+ ARM_BE8(setend        be)
  THUMB(        adr     r12, BSYM(1f)   )
  THUMB(        bx      r12             )
  THUMB(        .thumb                  )
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
         * position independent way.
         */
        adr     r5, 3f
-       ldmia   r5, {r6, r7, r8, r11}
+       ldmia   r5, {r0, r6, r7, r8, r11}
+       add     r0, r5, r0                      @ r0 = mcpm_entry_early_pokes
        add     r6, r5, r6                      @ r6 = mcpm_entry_vectors
        ldr     r7, [r5, r7]                    @ r7 = mcpm_power_up_setup_phys
        add     r8, r5, r8                      @ r8 = mcpm_sync
        add     r11, r5, r11                    @ r11 = first_man_locks
 
+       @ Perform an early poke, if any
+       add     r0, r0, r4, lsl #3
+       ldmia   r0, {r0, r1}
+       teq     r0, #0
+       strne   r1, [r0]
+
        mov     r0, #MCPM_SYNC_CLUSTER_SIZE
        mla     r8, r0, r10, r8                 @ r8 = sync cluster base
 
@@ -195,7 +204,8 @@ mcpm_entry_gated:
 
        .align  2
 
-3:     .word   mcpm_entry_vectors - .
+3:     .word   mcpm_entry_early_pokes - .
+       .word   mcpm_entry_vectors - 3b
        .word   mcpm_power_up_setup_phys - 3b
        .word   mcpm_sync - 3b
        .word   first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
 ENTRY(mcpm_entry_vectors)
        .space  4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
 
+       .type   mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+       .space  8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
        .type   mcpm_power_up_setup_phys, #object
 ENTRY(mcpm_power_up_setup_phys)
        .space  4               @ set by mcpm_sync_init()
diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c
index 1bc34c7567fdf890a5eea685c04d70f7de89ffbf..177251a4dd9aff7a8ba616d1ed04796e5c49cb88 100644 (file)
 #include <asm/smp.h>
 #include <asm/smp_plat.h>
 
+static void cpu_to_pcpu(unsigned int cpu,
+                       unsigned int *pcpu, unsigned int *pcluster)
+{
+       unsigned int mpidr;
+
+       mpidr = cpu_logical_map(cpu);
+       *pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+       *pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+}
+
 static int mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-       unsigned int mpidr, pcpu, pcluster, ret;
+       unsigned int pcpu, pcluster, ret;
        extern void secondary_startup(void);
 
-       mpidr = cpu_logical_map(cpu);
-       pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-       pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+       cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
        pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n",
                 __func__, cpu, pcpu, pcluster);
 
@@ -47,6 +56,15 @@ static void mcpm_secondary_init(unsigned int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+static int mcpm_cpu_kill(unsigned int cpu)
+{
+       unsigned int pcpu, pcluster;
+
+       cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
+       return !mcpm_cpu_power_down_finish(pcpu, pcluster);
+}
+
 static int mcpm_cpu_disable(unsigned int cpu)
 {
        /*
@@ -73,6 +91,7 @@ static struct smp_operations __initdata mcpm_smp_ops = {
        .smp_boot_secondary     = mcpm_boot_secondary,
        .smp_secondary_init     = mcpm_secondary_init,
 #ifdef CONFIG_HOTPLUG_CPU
+       .cpu_kill               = mcpm_cpu_kill,
        .cpu_disable            = mcpm_cpu_disable,
        .cpu_die                = mcpm_cpu_die,
 #endif
diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c
index e901d0f3e0bbcd735f5cf7e62bd653536ff8aa71..ce922d0ea7aa85daa59c408ac5cd79beab5459a6 100644 (file)
@@ -175,7 +175,7 @@ static struct clock_event_device sp804_clockevent = {
 
 static struct irqaction sp804_timer_irq = {
        .name           = "timer",
-       .flags          = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
+       .flags          = IRQF_TIMER | IRQF_IRQPOLL,
        .handler        = sp804_timer_interrupt,
        .dev_id         = &sp804_clockevent,
 };
diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig
index 317960f1248893d6200650ec9c01e8132b627c16..0142ec37e0be26d8c1480aa4929a722896b3b397 100644 (file)
@@ -1,5 +1,6 @@
-CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_MODULES=y
@@ -11,11 +12,11 @@ CONFIG_ARCH_SA1100=y
 CONFIG_SA1100_H3600=y
 CONFIG_PCCARD=y
 CONFIG_PCMCIA_SA1100=y
+CONFIG_PREEMPT=y
 CONFIG_ZBOOT_ROM_TEXT=0x0
 CONFIG_ZBOOT_ROM_BSS=0x0
 # CONFIG_CPU_FREQ_STAT is not set
 CONFIG_FPE_NWFPE=y
-CONFIG_PM=y
 CONFIG_NET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
@@ -24,13 +25,10 @@ CONFIG_IRDA=m
 CONFIG_IRLAN=m
 CONFIG_IRNET=m
 CONFIG_IRCOMM=m
-CONFIG_SA1100_FIR=m
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
 CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -41,19 +39,15 @@ CONFIG_MTD_SA1100=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
-# CONFIG_MISC_DEVICES is not set
 CONFIG_IDE=y
 CONFIG_BLK_DEV_IDECS=y
 CONFIG_NETDEVICES=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
-# CONFIG_WLAN is not set
-CONFIG_NET_PCMCIA=y
 CONFIG_PCMCIA_PCNET=y
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_ASYNC=m
+# CONFIG_WLAN is not set
 # CONFIG_KEYBOARD_ATKBD is not set
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
@@ -64,8 +58,6 @@ CONFIG_SERIAL_SA1100_CONSOLE=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
 CONFIG_FB_SA1100=y
-# CONFIG_VGA_CONSOLE is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EXT2_FS=y
 CONFIG_MSDOS_FS=m
@@ -74,6 +66,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=m
 CONFIG_NFS_FS=y
 CONFIG_NFSD=m
-CONFIG_SMB_FS=m
 CONFIG_NLS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644 (file)
index 0000000..6231d36
--- /dev/null
@@ -0,0 +1 @@
+aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90a29f5f1d06415cb4a0db4dd726e1..81cda39860c5c7ad90a6710727011ec79296e5d8 100644 (file)
@@ -3,7 +3,17 @@
 #
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 
-aes-arm-y  := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-y      := aes-armv4.o aes_glue.o
+aes-arm-bs-y   := aesbs-core.o aesbs-glue.o
+sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+       $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index 59f7877ead6ac9ee3f8a31b43c6e0458de26cd8f..3003fa1f6fb4b9395c77340fbf83011b5cb0e419 100644 (file)
@@ -6,22 +6,12 @@
 #include <linux/crypto.h>
 #include <crypto/aes.h>
 
-#define AES_MAXNR 14
+#include "aes_glue.h"
 
-typedef struct {
-       unsigned int rd_key[4 *(AES_MAXNR + 1)];
-       int rounds;
-} AES_KEY;
-
-struct AES_CTX {
-       AES_KEY enc_key;
-       AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
                .cipher = {
                        .cia_min_keysize        = AES_MIN_KEY_SIZE,
                        .cia_max_keysize        = AES_MAX_KEY_SIZE,
-                       .cia_setkey                     = aes_set_key,
+                       .cia_setkey             = aes_set_key,
                        .cia_encrypt            = aes_encrypt,
                        .cia_decrypt            = aes_decrypt
                }
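
For reference, here is a minimal sketch of how an in-kernel user might exercise the cipher this glue registers, going through the generic crypto API rather than calling the assembler entry points directly; example_aes_one_block() is a hypothetical helper and not part of the patch.

#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/aes.h>

/* Hypothetical consumer: encrypt a single 16-byte block via the crypto API. */
static int example_aes_one_block(const u8 key[AES_KEYSIZE_128],
                                 const u8 in[AES_BLOCK_SIZE],
                                 u8 out[AES_BLOCK_SIZE])
{
        struct crypto_cipher *tfm;
        int err;

        tfm = crypto_alloc_cipher("aes", 0, 0); /* highest-priority AES provider */
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_cipher_setkey(tfm, key, AES_KEYSIZE_128);
        if (!err)
                crypto_cipher_encrypt_one(tfm, out, in);

        crypto_free_cipher(tfm);
        return err;
}
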
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644 (file)
index 0000000..cca3e51
--- /dev/null
@@ -0,0 +1,19 @@
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+       unsigned int rd_key[4 * (AES_MAXNR + 1)];
+       int rounds;
+};
+
+struct AES_CTX {
+       struct AES_KEY enc_key;
+       struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+                                          const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+                                          const int bits, struct AES_KEY *key);
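
A minimal sketch of how glue code consumes the entry points declared in this header, assuming crypto_tfm_ctx() yields a struct AES_CTX as in the existing aes_glue.c; bsaes_setkey_sketch() is an illustrative name, not a function added by the patch.

#include <linux/errno.h>
#include <linux/crypto.h>
#include <crypto/aes.h>
#include "aes_glue.h"

/* Sketch: run both asm key-schedule helpers (they take the key size in bits). */
static int bsaes_setkey_sketch(struct crypto_tfm *tfm, const u8 *in_key,
                               unsigned int key_len)
{
        struct AES_CTX *ctx = crypto_tfm_ctx(tfm);

        if (private_AES_set_encrypt_key(in_key, key_len * 8, &ctx->enc_key) ||
            private_AES_set_decrypt_key(in_key, key_len * 8, &ctx->dec_key)) {
                crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
                return -EINVAL;
        }
        return 0;
}
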
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
new file mode 100644 (file)
index 0000000..64205d4
--- /dev/null
@@ -0,0 +1,2544 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt      19.5 cycles per byte processed with 128-bit key
+@ decrypt      22.1 cycles per byte processed with 128-bit key
+@ key conv.    440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@                                              <appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@                                      <ard.biesheuvel@linaro.org>
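
To make the comment above concrete: bit-slicing transposes eight AES blocks so that each NEON q register holds one bit position of all eight blocks, and a single logical instruction then operates on that bit of every block at once. A purely illustrative userspace toy of the transposition, shrunk to one byte per block (bitslice8() is not part of the patch):

#include <stdint.h>

/* Toy transpose: out[bit] collects bit 'bit' of each of the eight input bytes. */
static void bitslice8(const uint8_t in[8], uint8_t out[8])
{
        for (int bit = 0; bit < 8; bit++) {
                uint8_t lane = 0;
                for (int blk = 0; blk < 8; blk++)
                        lane |= (uint8_t)(((in[blk] >> bit) & 1u) << blk);
                out[bit] = lane;
        }
}
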
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
+# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax        unified         @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu   neon
+
+.type  _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+       adr     r6,_bsaes_decrypt8
+       vldmia  r4!, {q9}               @ round 0 key
+       add     r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+       vldmia  r6!, {q8}               @ .LM0ISR
+       veor    q10, q0, q9     @ xor with round0 key
+       veor    q11, q1, q9
+        vtbl.8 d0, {q10}, d16
+        vtbl.8 d1, {q10}, d17
+       veor    q12, q2, q9
+        vtbl.8 d2, {q11}, d16
+        vtbl.8 d3, {q11}, d17
+       veor    q13, q3, q9
+        vtbl.8 d4, {q12}, d16
+        vtbl.8 d5, {q12}, d17
+       veor    q14, q4, q9
+        vtbl.8 d6, {q13}, d16
+        vtbl.8 d7, {q13}, d17
+       veor    q15, q5, q9
+        vtbl.8 d8, {q14}, d16
+        vtbl.8 d9, {q14}, d17
+       veor    q10, q6, q9
+        vtbl.8 d10, {q15}, d16
+        vtbl.8 d11, {q15}, d17
+       veor    q11, q7, q9
+        vtbl.8 d12, {q10}, d16
+        vtbl.8 d13, {q10}, d17
+        vtbl.8 d14, {q11}, d16
+        vtbl.8 d15, {q11}, d17
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q6, #1
+        vshr.u64       q11, q4, #1
+       veor            q10, q10, q7
+        veor           q11, q11, q5
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #1
+        veor           q5, q5, q11
+        vshl.u64       q11, q11, #1
+       veor            q6, q6, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q2, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q3
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q2, q2, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q5, #2
+        vshr.u64       q11, q4, #2
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #2
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #2
+       veor            q5, q5, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q3
+        veor           q11, q11, q2
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #2
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q3, #4
+        vshr.u64       q11, q2, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #4
+       veor            q3, q3, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q4
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q4, q4, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       sub     r5,r5,#1
+       b       .Ldec_sbox
+.align 4
+.Ldec_loop:
+       vldmia  r4!, {q8-q11}
+       veor    q8, q8, q0
+       veor    q9, q9, q1
+       vtbl.8  d0, {q8}, d24
+       vtbl.8  d1, {q8}, d25
+       vldmia  r4!, {q8}
+       veor    q10, q10, q2
+       vtbl.8  d2, {q9}, d24
+       vtbl.8  d3, {q9}, d25
+       vldmia  r4!, {q9}
+       veor    q11, q11, q3
+       vtbl.8  d4, {q10}, d24
+       vtbl.8  d5, {q10}, d25
+       vldmia  r4!, {q10}
+       vtbl.8  d6, {q11}, d24
+       vtbl.8  d7, {q11}, d25
+       vldmia  r4!, {q11}
+       veor    q8, q8, q4
+       veor    q9, q9, q5
+       vtbl.8  d8, {q8}, d24
+       vtbl.8  d9, {q8}, d25
+       veor    q10, q10, q6
+       vtbl.8  d10, {q9}, d24
+       vtbl.8  d11, {q9}, d25
+       veor    q11, q11, q7
+       vtbl.8  d12, {q10}, d24
+       vtbl.8  d13, {q10}, d25
+       vtbl.8  d14, {q11}, d24
+       vtbl.8  d15, {q11}, d25
+.Ldec_sbox:
+        veor   q1, q1, q4
+       veor    q3, q3, q4
+
+       veor    q4, q4, q7
+        veor   q1, q1, q6
+       veor    q2, q2, q7
+       veor    q6, q6, q4
+
+       veor    q0, q0, q1
+       veor    q2, q2, q5
+        veor   q7, q7, q6
+       veor    q3, q3, q0
+       veor    q5, q5, q0
+       veor    q1, q1, q3
+       veor    q11, q3, q0
+       veor    q10, q7, q4
+       veor    q9, q1, q6
+       veor    q13, q4, q0
+        vmov   q8, q10
+       veor    q12, q5, q2
+
+       vorr    q10, q10, q9
+       veor    q15, q11, q8
+       vand    q14, q11, q12
+       vorr    q11, q11, q12
+       veor    q12, q12, q9
+       vand    q8, q8, q9
+       veor    q9, q6, q2
+       vand    q15, q15, q12
+       vand    q13, q13, q9
+       veor    q9, q3, q7
+       veor    q12, q1, q5
+       veor    q11, q11, q13
+       veor    q10, q10, q13
+       vand    q13, q9, q12
+       vorr    q9, q9, q12
+       veor    q11, q11, q15
+       veor    q8, q8, q13
+       veor    q10, q10, q14
+       veor    q9, q9, q15
+       veor    q8, q8, q14
+       vand    q12, q4, q6
+       veor    q9, q9, q14
+       vand    q13, q0, q2
+       vand    q14, q7, q1
+       vorr    q15, q3, q5
+       veor    q11, q11, q12
+       veor    q9, q9, q14
+       veor    q8, q8, q15
+       veor    q10, q10, q13
+
+       @ Inv_GF16      0,      1,      2,      3, s0, s1, s2, s3
+
+       @ new smaller inversion
+
+       vand    q14, q11, q9
+       vmov    q12, q8
+
+       veor    q13, q10, q14
+       veor    q15, q8, q14
+       veor    q14, q8, q14    @ q14=q15
+
+       vbsl    q13, q9, q8
+       vbsl    q15, q11, q10
+       veor    q11, q11, q10
+
+       vbsl    q12, q13, q14
+       vbsl    q8, q14, q13
+
+       vand    q14, q12, q15
+       veor    q9, q9, q8
+
+       veor    q14, q14, q11
+       veor    q12, q5, q2
+       veor    q8, q1, q6
+       veor    q10, q15, q14
+       vand    q10, q10, q5
+       veor    q5, q5, q1
+       vand    q11, q1, q15
+       vand    q5, q5, q14
+       veor    q1, q11, q10
+       veor    q5, q5, q11
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q2
+       veor    q12, q12, q8
+        veor   q2, q2, q6
+       vand    q8, q8, q15
+        vand   q6, q6, q13
+       vand    q12, q12, q14
+        vand   q2, q2, q9
+       veor    q8, q8, q12
+        veor   q2, q2, q6
+       veor    q12, q12, q11
+        veor   q6, q6, q10
+       veor    q5, q5, q12
+       veor    q2, q2, q12
+       veor    q1, q1, q8
+       veor    q6, q6, q8
+
+       veor    q12, q3, q0
+       veor    q8, q7, q4
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q0
+       veor    q12, q12, q8
+        veor   q0, q0, q4
+       vand    q8, q8, q15
+        vand   q4, q4, q13
+       vand    q12, q12, q14
+        vand   q0, q0, q9
+       veor    q8, q8, q12
+        veor   q0, q0, q4
+       veor    q12, q12, q11
+        veor   q4, q4, q10
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q10, q15, q14
+       vand    q10, q10, q3
+       veor    q3, q3, q7
+       vand    q11, q7, q15
+       vand    q3, q3, q14
+       veor    q7, q11, q10
+       veor    q3, q3, q11
+       veor    q3, q3, q12
+       veor    q0, q0, q12
+       veor    q7, q7, q8
+       veor    q4, q4, q8
+       veor    q1, q1, q7
+       veor    q6, q6, q5
+
+       veor    q4, q4, q1
+       veor    q2, q2, q7
+       veor    q5, q5, q7
+       veor    q4, q4, q2
+        veor   q7, q7, q0
+       veor    q4, q4, q5
+        veor   q3, q3, q6
+        veor   q6, q6, q1
+       veor    q3, q3, q4
+
+       veor    q4, q4, q0
+       veor    q7, q7, q3
+       subs    r5,r5,#1
+       bcc     .Ldec_done
+       @ multiplication by 0x05-0x00-0x04-0x00
+       vext.8  q8, q0, q0, #8
+       vext.8  q14, q3, q3, #8
+       vext.8  q15, q5, q5, #8
+       veor    q8, q8, q0
+       vext.8  q9, q1, q1, #8
+       veor    q14, q14, q3
+       vext.8  q10, q6, q6, #8
+       veor    q15, q15, q5
+       vext.8  q11, q4, q4, #8
+       veor    q9, q9, q1
+       vext.8  q12, q2, q2, #8
+       veor    q10, q10, q6
+       vext.8  q13, q7, q7, #8
+       veor    q11, q11, q4
+       veor    q12, q12, q2
+       veor    q13, q13, q7
+
+        veor   q0, q0, q14
+        veor   q1, q1, q14
+        veor   q6, q6, q8
+        veor   q2, q2, q10
+        veor   q4, q4, q9
+        veor   q1, q1, q15
+        veor   q6, q6, q15
+        veor   q2, q2, q14
+        veor   q7, q7, q11
+        veor   q4, q4, q14
+        veor   q3, q3, q12
+        veor   q2, q2, q15
+        veor   q7, q7, q15
+        veor   q5, q5, q13
+       vext.8  q8, q0, q0, #12 @ x0 <<< 32
+       vext.8  q9, q1, q1, #12
+        veor   q0, q0, q8              @ x0 ^ (x0 <<< 32)
+       vext.8  q10, q6, q6, #12
+        veor   q1, q1, q9
+       vext.8  q11, q4, q4, #12
+        veor   q6, q6, q10
+       vext.8  q12, q2, q2, #12
+        veor   q4, q4, q11
+       vext.8  q13, q7, q7, #12
+        veor   q2, q2, q12
+       vext.8  q14, q3, q3, #12
+        veor   q7, q7, q13
+       vext.8  q15, q5, q5, #12
+        veor   q3, q3, q14
+
+       veor    q9, q9, q0
+        veor   q5, q5, q15
+        vext.8 q0, q0, q0, #8          @ (x0 ^ (x0 <<< 32)) <<< 64)
+       veor    q10, q10, q1
+       veor    q8, q8, q5
+       veor    q9, q9, q5
+        vext.8 q1, q1, q1, #8
+       veor    q13, q13, q2
+        veor   q0, q0, q8
+       veor    q14, q14, q7
+        veor   q1, q1, q9
+        vext.8 q8, q2, q2, #8
+       veor    q12, q12, q4
+        vext.8 q9, q7, q7, #8
+       veor    q15, q15, q3
+        vext.8 q2, q4, q4, #8
+       veor    q11, q11, q6
+        vext.8 q7, q5, q5, #8
+       veor    q12, q12, q5
+        vext.8 q4, q3, q3, #8
+       veor    q11, q11, q5
+        vext.8 q3, q6, q6, #8
+       veor    q5, q9, q13
+       veor    q11, q11, q2
+       veor    q7, q7, q15
+       veor    q6, q4, q14
+       veor    q4, q8, q12
+       veor    q2, q3, q10
+       vmov    q3, q11
+        @ vmov q5, q9
+       vldmia  r6, {q12}               @ .LISR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   r6,r6,#0x10
+       bne     .Ldec_loop
+       vldmia  r6, {q12}               @ .LISRM0
+       b       .Ldec_loop
+.align 4
+.Ldec_done:
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q3, #1
+        vshr.u64       q11, q2, #1
+       veor            q10, q10, q5
+        veor           q11, q11, q7
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #1
+        veor           q7, q7, q11
+        vshl.u64       q11, q11, #1
+       veor            q3, q3, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q6, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q4
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q4, q4, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q6, q6, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q7, #2
+        vshr.u64       q11, q2, #2
+       veor            q10, q10, q5
+        veor           q11, q11, q3
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #2
+        veor           q3, q3, q11
+        vshl.u64       q11, q11, #2
+       veor            q7, q7, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q4
+        veor           q11, q11, q6
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q4, q4, q10
+       vshl.u64        q10, q10, #2
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q4, #4
+        vshr.u64       q11, q6, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q3
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q3, q3, q11
+        vshl.u64       q11, q11, #4
+       veor            q4, q4, q10
+        veor           q6, q6, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q2
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vldmia  r4, {q8}                        @ last round key
+       veor    q6, q6, q8
+       veor    q4, q4, q8
+       veor    q2, q2, q8
+       veor    q7, q7, q8
+       veor    q3, q3, q8
+       veor    q5, q5, q8
+       veor    q0, q0, q8
+       veor    q1, q1, q8
+       bx      lr
+.size  _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type  _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR:       @ InvShiftRows constants
+       .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+       .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+       .quad   0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:                @ ShiftRows constants
+       .quad   0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+       .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+       .quad   0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+       .quad   0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+       .quad   0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 6
+.size  _bsaes_const,.-_bsaes_const
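
The quadwords in _bsaes_const are byte-index vectors: vtbl.8 treats the (key-XORed) state registers as a 16-byte lookup table and these constants as the indices, which is how ShiftRows, InvShiftRows and the .LM0 interleaving reorder bytes without branches. A plain-C analogue of that lookup, assuming all indices are in range (vtbl.8 would return zero for out-of-range ones); tbl16() is an illustrative name:

#include <stdint.h>

/* C analogue of "vtbl.8 dN, {table}, index": dst[i] = table[idx[i]]. */
static void tbl16(const uint8_t table[16], const uint8_t idx[16], uint8_t dst[16])
{
        for (int i = 0; i < 16; i++)
                dst[i] = table[idx[i]];
}
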
+
+.type  _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+       adr     r6,_bsaes_encrypt8
+       vldmia  r4!, {q9}               @ round 0 key
+       sub     r6,r6,#_bsaes_encrypt8-.LM0SR
+
+       vldmia  r6!, {q8}               @ .LM0SR
+_bsaes_encrypt8_alt:
+       veor    q10, q0, q9     @ xor with round0 key
+       veor    q11, q1, q9
+        vtbl.8 d0, {q10}, d16
+        vtbl.8 d1, {q10}, d17
+       veor    q12, q2, q9
+        vtbl.8 d2, {q11}, d16
+        vtbl.8 d3, {q11}, d17
+       veor    q13, q3, q9
+        vtbl.8 d4, {q12}, d16
+        vtbl.8 d5, {q12}, d17
+       veor    q14, q4, q9
+        vtbl.8 d6, {q13}, d16
+        vtbl.8 d7, {q13}, d17
+       veor    q15, q5, q9
+        vtbl.8 d8, {q14}, d16
+        vtbl.8 d9, {q14}, d17
+       veor    q10, q6, q9
+        vtbl.8 d10, {q15}, d16
+        vtbl.8 d11, {q15}, d17
+       veor    q11, q7, q9
+        vtbl.8 d12, {q10}, d16
+        vtbl.8 d13, {q10}, d17
+        vtbl.8 d14, {q11}, d16
+        vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q6, #1
+        vshr.u64       q11, q4, #1
+       veor            q10, q10, q7
+        veor           q11, q11, q5
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #1
+        veor           q5, q5, q11
+        vshl.u64       q11, q11, #1
+       veor            q6, q6, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q2, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q3
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q2, q2, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q5, #2
+        vshr.u64       q11, q4, #2
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #2
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #2
+       veor            q5, q5, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q3
+        veor           q11, q11, q2
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #2
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q3, #4
+        vshr.u64       q11, q2, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #4
+       veor            q3, q3, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q4
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q4, q4, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       sub     r5,r5,#1
+       b       .Lenc_sbox
+.align 4
+.Lenc_loop:
+       vldmia  r4!, {q8-q11}
+       veor    q8, q8, q0
+       veor    q9, q9, q1
+       vtbl.8  d0, {q8}, d24
+       vtbl.8  d1, {q8}, d25
+       vldmia  r4!, {q8}
+       veor    q10, q10, q2
+       vtbl.8  d2, {q9}, d24
+       vtbl.8  d3, {q9}, d25
+       vldmia  r4!, {q9}
+       veor    q11, q11, q3
+       vtbl.8  d4, {q10}, d24
+       vtbl.8  d5, {q10}, d25
+       vldmia  r4!, {q10}
+       vtbl.8  d6, {q11}, d24
+       vtbl.8  d7, {q11}, d25
+       vldmia  r4!, {q11}
+       veor    q8, q8, q4
+       veor    q9, q9, q5
+       vtbl.8  d8, {q8}, d24
+       vtbl.8  d9, {q8}, d25
+       veor    q10, q10, q6
+       vtbl.8  d10, {q9}, d24
+       vtbl.8  d11, {q9}, d25
+       veor    q11, q11, q7
+       vtbl.8  d12, {q10}, d24
+       vtbl.8  d13, {q10}, d25
+       vtbl.8  d14, {q11}, d24
+       vtbl.8  d15, {q11}, d25
+.Lenc_sbox:
+       veor    q2, q2, q1
+       veor    q5, q5, q6
+       veor    q3, q3, q0
+       veor    q6, q6, q2
+       veor    q5, q5, q0
+
+       veor    q6, q6, q3
+       veor    q3, q3, q7
+       veor    q7, q7, q5
+       veor    q3, q3, q4
+       veor    q4, q4, q5
+
+       veor    q2, q2, q7
+       veor    q3, q3, q1
+       veor    q1, q1, q5
+       veor    q11, q7, q4
+       veor    q10, q1, q2
+       veor    q9, q5, q3
+       veor    q13, q2, q4
+        vmov   q8, q10
+       veor    q12, q6, q0
+
+       vorr    q10, q10, q9
+       veor    q15, q11, q8
+       vand    q14, q11, q12
+       vorr    q11, q11, q12
+       veor    q12, q12, q9
+       vand    q8, q8, q9
+       veor    q9, q3, q0
+       vand    q15, q15, q12
+       vand    q13, q13, q9
+       veor    q9, q7, q1
+       veor    q12, q5, q6
+       veor    q11, q11, q13
+       veor    q10, q10, q13
+       vand    q13, q9, q12
+       vorr    q9, q9, q12
+       veor    q11, q11, q15
+       veor    q8, q8, q13
+       veor    q10, q10, q14
+       veor    q9, q9, q15
+       veor    q8, q8, q14
+       vand    q12, q2, q3
+       veor    q9, q9, q14
+       vand    q13, q4, q0
+       vand    q14, q1, q5
+       vorr    q15, q7, q6
+       veor    q11, q11, q12
+       veor    q9, q9, q14
+       veor    q8, q8, q15
+       veor    q10, q10, q13
+
+       @ Inv_GF16      0,      1,      2,      3, s0, s1, s2, s3
+
+       @ new smaller inversion
+
+       vand    q14, q11, q9
+       vmov    q12, q8
+
+       veor    q13, q10, q14
+       veor    q15, q8, q14
+       veor    q14, q8, q14    @ q14=q15
+
+       vbsl    q13, q9, q8
+       vbsl    q15, q11, q10
+       veor    q11, q11, q10
+
+       vbsl    q12, q13, q14
+       vbsl    q8, q14, q13
+
+       vand    q14, q12, q15
+       veor    q9, q9, q8
+
+       veor    q14, q14, q11
+       veor    q12, q6, q0
+       veor    q8, q5, q3
+       veor    q10, q15, q14
+       vand    q10, q10, q6
+       veor    q6, q6, q5
+       vand    q11, q5, q15
+       vand    q6, q6, q14
+       veor    q5, q11, q10
+       veor    q6, q6, q11
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q0
+       veor    q12, q12, q8
+        veor   q0, q0, q3
+       vand    q8, q8, q15
+        vand   q3, q3, q13
+       vand    q12, q12, q14
+        vand   q0, q0, q9
+       veor    q8, q8, q12
+        veor   q0, q0, q3
+       veor    q12, q12, q11
+        veor   q3, q3, q10
+       veor    q6, q6, q12
+       veor    q0, q0, q12
+       veor    q5, q5, q8
+       veor    q3, q3, q8
+
+       veor    q12, q7, q4
+       veor    q8, q1, q2
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q4
+       veor    q12, q12, q8
+        veor   q4, q4, q2
+       vand    q8, q8, q15
+        vand   q2, q2, q13
+       vand    q12, q12, q14
+        vand   q4, q4, q9
+       veor    q8, q8, q12
+        veor   q4, q4, q2
+       veor    q12, q12, q11
+        veor   q2, q2, q10
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q10, q15, q14
+       vand    q10, q10, q7
+       veor    q7, q7, q1
+       vand    q11, q1, q15
+       vand    q7, q7, q14
+       veor    q1, q11, q10
+       veor    q7, q7, q11
+       veor    q7, q7, q12
+       veor    q4, q4, q12
+       veor    q1, q1, q8
+       veor    q2, q2, q8
+       veor    q7, q7, q0
+       veor    q1, q1, q6
+       veor    q6, q6, q0
+       veor    q4, q4, q7
+       veor    q0, q0, q1
+
+       veor    q1, q1, q5
+       veor    q5, q5, q2
+       veor    q2, q2, q3
+       veor    q3, q3, q5
+       veor    q4, q4, q5
+
+       veor    q6, q6, q3
+       subs    r5,r5,#1
+       bcc     .Lenc_done
+       vext.8  q8, q0, q0, #12 @ x0 <<< 32
+       vext.8  q9, q1, q1, #12
+        veor   q0, q0, q8              @ x0 ^ (x0 <<< 32)
+       vext.8  q10, q4, q4, #12
+        veor   q1, q1, q9
+       vext.8  q11, q6, q6, #12
+        veor   q4, q4, q10
+       vext.8  q12, q3, q3, #12
+        veor   q6, q6, q11
+       vext.8  q13, q7, q7, #12
+        veor   q3, q3, q12
+       vext.8  q14, q2, q2, #12
+        veor   q7, q7, q13
+       vext.8  q15, q5, q5, #12
+        veor   q2, q2, q14
+
+       veor    q9, q9, q0
+        veor   q5, q5, q15
+        vext.8 q0, q0, q0, #8          @ (x0 ^ (x0 <<< 32)) <<< 64)
+       veor    q10, q10, q1
+       veor    q8, q8, q5
+       veor    q9, q9, q5
+        vext.8 q1, q1, q1, #8
+       veor    q13, q13, q3
+        veor   q0, q0, q8
+       veor    q14, q14, q7
+        veor   q1, q1, q9
+        vext.8 q8, q3, q3, #8
+       veor    q12, q12, q6
+        vext.8 q9, q7, q7, #8
+       veor    q15, q15, q2
+        vext.8 q3, q6, q6, #8
+       veor    q11, q11, q4
+        vext.8 q7, q5, q5, #8
+       veor    q12, q12, q5
+        vext.8 q6, q2, q2, #8
+       veor    q11, q11, q5
+        vext.8 q2, q4, q4, #8
+       veor    q5, q9, q13
+       veor    q4, q8, q12
+       veor    q3, q3, q11
+       veor    q7, q7, q15
+       veor    q6, q6, q14
+        @ vmov q4, q8
+       veor    q2, q2, q10
+        @ vmov q5, q9
+       vldmia  r6, {q12}               @ .LSR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   r6,r6,#0x10
+       bne     .Lenc_loop
+       vldmia  r6, {q12}               @ .LSRM0
+       b       .Lenc_loop
+.align 4
+.Lenc_done:
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q2, #1
+        vshr.u64       q11, q3, #1
+       veor            q10, q10, q5
+        veor           q11, q11, q7
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #1
+        veor           q7, q7, q11
+        vshl.u64       q11, q11, #1
+       veor            q2, q2, q10
+        veor           q3, q3, q11
+       vshr.u64        q10, q4, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q6
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q6, q6, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q4, q4, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q7, #2
+        vshr.u64       q11, q3, #2
+       veor            q10, q10, q5
+        veor           q11, q11, q2
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #2
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #2
+       veor            q7, q7, q10
+        veor           q3, q3, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q6
+        veor           q11, q11, q4
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q6, q6, q10
+       vshl.u64        q10, q10, #2
+        veor           q4, q4, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q6, #4
+        vshr.u64       q11, q4, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q2
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #4
+       veor            q6, q6, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q3
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q3, q3, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vldmia  r4, {q8}                        @ last round key
+       veor    q4, q4, q8
+       veor    q6, q6, q8
+       veor    q3, q3, q8
+       veor    q7, q7, q8
+       veor    q2, q2, q8
+       veor    q5, q5, q8
+       veor    q0, q0, q8
+       veor    q1, q1, q8
+       bx      lr
+.size  _bsaes_encrypt8,.-_bsaes_encrypt8
+.type  _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+       adr     r6,_bsaes_key_convert
+       vld1.8  {q7},  [r4]!            @ load round 0 key
+       sub     r6,r6,#_bsaes_key_convert-.LM0
+       vld1.8  {q15}, [r4]!            @ load round 1 key
+
+       vmov.i8 q8,  #0x01                      @ bit masks
+       vmov.i8 q9,  #0x02
+       vmov.i8 q10, #0x04
+       vmov.i8 q11, #0x08
+       vmov.i8 q12, #0x10
+       vmov.i8 q13, #0x20
+       vldmia  r6, {q14}               @ .LM0
+
+#ifdef __ARMEL__
+       vrev32.8        q7,  q7
+       vrev32.8        q15, q15
+#endif
+       sub     r5,r5,#1
+       vstmia  r12!, {q7}              @ save round 0 key
+       b       .Lkey_loop
+
+.align 4
+.Lkey_loop:
+       vtbl.8  d14,{q15},d28
+       vtbl.8  d15,{q15},d29
+       vmov.i8 q6,  #0x40
+       vmov.i8 q15, #0x80
+
+       vtst.8  q0, q7, q8
+       vtst.8  q1, q7, q9
+       vtst.8  q2, q7, q10
+       vtst.8  q3, q7, q11
+       vtst.8  q4, q7, q12
+       vtst.8  q5, q7, q13
+       vtst.8  q6, q7, q6
+       vtst.8  q7, q7, q15
+       vld1.8  {q15}, [r4]!            @ load next round key
+       vmvn    q0, q0          @ "pnot"
+       vmvn    q1, q1
+       vmvn    q5, q5
+       vmvn    q6, q6
+#ifdef __ARMEL__
+       vrev32.8        q15, q15
+#endif
+       subs    r5,r5,#1
+       vstmia  r12!,{q0-q7}            @ write bit-sliced round key
+       bne     .Lkey_loop
+
+       vmov.i8 q7,#0x63                        @ compose .L63
+       @ don't save last round key
+       bx      lr
+.size  _bsaes_key_convert,.-_bsaes_key_convert
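
_bsaes_key_convert stores the schedule in bit-sliced form: vtst.8 against the 0x01..0x80 masks turns each key bit into a full 0x00/0xff byte lane, and a few lanes are then inverted ("pnot"), with the callers fixing up the round 0 or last round key afterwards. A toy C rendition of the mask-expansion step, reduced to a single byte (expand_key_byte() is illustrative only):

#include <stdint.h>

/* Expand one round-key byte into eight 0x00/0xff mask bytes (cf. vtst.8). */
static void expand_key_byte(uint8_t key_byte, uint8_t masks[8])
{
        for (int i = 0; i < 8; i++)
                masks[i] = (key_byte & (1u << i)) ? 0xff : 0x00;
}
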
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global        bsaes_cbc_encrypt
+.type  bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef        __KERNEL__
+       cmp     r2, #128
+#ifndef        __thumb__
+       blo     AES_cbc_encrypt
+#else
+       bhs     1f
+       b       AES_cbc_encrypt
+1:
+#endif
+#endif
+
+       @ it is up to the caller to make sure we are called with enc == 0
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     r8, [ip]                        @ IV is 1st arg on the stack
+       mov     r2, r2, lsr#4           @ len in 16 byte blocks
+       sub     sp, #0x10                       @ scratch space to carry over the IV
+       mov     r9, sp                          @ save sp
+
+       ldr     r10, [r3, #240]         @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r10, lsl#7             @ 128 bytes per inner round key
+       add     r12, #96                        @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       mov     sp, r12                         @ sp is sp
+       bl      _bsaes_key_convert
+       vldmia  sp, {q6}
+       vstmia  r12,  {q15}             @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  sp, {q7}
+#else
+       ldr     r12, [r3, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [r3, #244]
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       add     r12, r3, #248                   @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, r3, #248
+       vldmia  r4, {q6}
+       vstmia  r12, {q15}                      @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  r4, {q7}
+
+.align 2
+0:
+#endif
+
+       vld1.8  {q15}, [r8]             @ load IV
+       b       .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+       subs    r2, r2, #0x8
+       bmi     .Lcbc_dec_loop_finish
+
+       vld1.8  {q0-q1}, [r0]!  @ load input
+       vld1.8  {q2-q3}, [r0]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, sp                  @ pass the key
+#else
+       add     r4, r3, #248
+#endif
+       vld1.8  {q4-q5}, [r0]!
+       mov     r5, r10
+       vld1.8  {q6-q7}, [r0]
+       sub     r0, r0, #0x60
+       vstmia  r9, {q15}                       @ put aside IV
+
+       bl      _bsaes_decrypt8
+
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q12-q13}, [r0]!
+       veor    q4, q4, q10
+       veor    q2, q2, q11
+       vld1.8  {q14-q15}, [r0]!
+       veor    q7, q7, q12
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       veor    q3, q3, q13
+       vst1.8  {q6}, [r1]!
+       veor    q5, q5, q14
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       vst1.8  {q7}, [r1]!
+       vst1.8  {q3}, [r1]!
+       vst1.8  {q5}, [r1]!
+
+       b       .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+       adds    r2, r2, #8
+       beq     .Lcbc_dec_done
+
+       vld1.8  {q0}, [r0]!             @ load input
+       cmp     r2, #2
+       blo     .Lcbc_dec_one
+       vld1.8  {q1}, [r0]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, sp                  @ pass the key
+#else
+       add     r4, r3, #248
+#endif
+       mov     r5, r10
+       vstmia  r9, {q15}                       @ put aside IV
+       beq     .Lcbc_dec_two
+       vld1.8  {q2}, [r0]!
+       cmp     r2, #4
+       blo     .Lcbc_dec_three
+       vld1.8  {q3}, [r0]!
+       beq     .Lcbc_dec_four
+       vld1.8  {q4}, [r0]!
+       cmp     r2, #6
+       blo     .Lcbc_dec_five
+       vld1.8  {q5}, [r0]!
+       beq     .Lcbc_dec_six
+       vld1.8  {q6}, [r0]!
+       sub     r0, r0, #0x70
+
+       bl      _bsaes_decrypt8
+
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q12-q13}, [r0]!
+       veor    q4, q4, q10
+       veor    q2, q2, q11
+       vld1.8  {q15}, [r0]!
+       veor    q7, q7, q12
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       veor    q3, q3, q13
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       vst1.8  {q7}, [r1]!
+       vst1.8  {q3}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+       sub     r0, r0, #0x60
+       bl      _bsaes_decrypt8
+       vldmia  r9,{q14}                        @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q12}, [r0]!
+       veor    q4, q4, q10
+       veor    q2, q2, q11
+       vld1.8  {q15}, [r0]!
+       veor    q7, q7, q12
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       vst1.8  {q7}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+       sub     r0, r0, #0x50
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q15}, [r0]!
+       veor    q4, q4, q10
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       veor    q2, q2, q11
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+       sub     r0, r0, #0x40
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q15}, [r0]!
+       veor    q4, q4, q10
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+       sub     r0, r0, #0x30
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q15}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       vst1.8  {q6}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+       sub     r0, r0, #0x20
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8}, [r0]!             @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q15}, [r0]!            @ reload input
+       veor    q1, q1, q8
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+       sub     r0, r0, #0x10
+       mov     r10, r1                 @ save original out pointer
+       mov     r1, r9                  @ use the iv scratch space as out buffer
+       mov     r2, r3
+       vmov    q4,q15          @ just in case ensure that IV
+       vmov    q5,q0                   @ and input are preserved
+       bl      AES_decrypt
+       vld1.8  {q0}, [r9,:64]          @ load result
+       veor    q0, q0, q4      @ ^= IV
+       vmov    q15, q5         @ q5 holds input
+       vst1.8  {q0}, [r10]             @ write output
+
+.Lcbc_dec_done:
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+.Lcbc_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r9
+       bne             .Lcbc_dec_bzero
+#endif
+
+       mov     sp, r9
+       add     sp, #0x10                       @ add sp,r9,#0x10 is no good for thumb
+       vst1.8  {q15}, [r8]             @ return IV
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}
+.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
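
As the "enc == 0" note above indicates, this entry point is only used for decryption; CBC encryption is inherently serial and cannot be spread across eight blocks. What the routine computes is ordinary CBC decryption, P[i] = D(C[i]) ^ C[i-1], with the IV standing in for C[-1] and the final ciphertext block written back as the next IV. A hedged plain-C reference (the names and the function-pointer type are illustrative, not kernel interfaces):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*aes_decrypt_block_fn)(const uint8_t in[16], uint8_t out[16],
                                     const void *key);

/* Reference CBC decryption; works in place because C[i] is saved first. */
static void cbc_decrypt_ref(const uint8_t *in, uint8_t *out, size_t blocks,
                            const void *key, uint8_t iv[16],
                            aes_decrypt_block_fn decrypt)
{
        uint8_t prev[16], tmp[16];

        memcpy(prev, iv, 16);
        for (size_t i = 0; i < blocks; i++) {
                memcpy(tmp, in + 16 * i, 16);   /* keep C[i] for the next block */
                decrypt(tmp, out + 16 * i, key);
                for (int j = 0; j < 16; j++)
                        out[16 * i + j] ^= prev[j];
                memcpy(prev, tmp, 16);
        }
        memcpy(iv, prev, 16);                   /* chain the IV, as [r8] above */
}
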
+.extern        AES_encrypt
+.global        bsaes_ctr32_encrypt_blocks
+.type  bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+       cmp     r2, #8                  @ use plain AES for
+       blo     .Lctr_enc_short                 @ small sizes
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     r8, [ip]                        @ ctr is 1st arg on the stack
+       sub     sp, sp, #0x10                   @ scratch space to carry over the ctr
+       mov     r9, sp                          @ save sp
+
+       ldr     r10, [r3, #240]         @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r10, lsl#7             @ 128 bytes per inner round key
+       add     r12, #96                        @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       mov     sp, r12                         @ sp is sp
+       bl      _bsaes_key_convert
+       veor    q7,q7,q15       @ fix up last round key
+       vstmia  r12, {q7}                       @ save last round key
+
+       vld1.8  {q0}, [r8]              @ load counter
+       add     r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+       vldmia  sp, {q4}                @ load round0 key
+#else
+       ldr     r12, [r3, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [r3, #244]
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       add     r12, r3, #248                   @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    q7,q7,q15       @ fix up last round key
+       vstmia  r12, {q7}                       @ save last round key
+
+.align 2
+0:     add     r12, r3, #248
+       vld1.8  {q0}, [r8]              @ load counter
+       adrl    r8, .LREVM0SR                   @ borrow r8
+       vldmia  r12, {q4}                       @ load round0 key
+       sub     sp, #0x10                       @ place for adjusted round0 key
+#endif
+
+       vmov.i32        q8,#1           @ compose 1<<96
+       veor            q9,q9,q9
+       vrev32.8        q0,q0
+       vext.8          q8,q9,q8,#4
+       vrev32.8        q4,q4
+       vadd.u32        q9,q8,q8        @ compose 2<<96
+       vstmia  sp, {q4}                @ save adjusted round0 key
+       b       .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+       vadd.u32        q10, q8, q9     @ compose 3<<96
+       vadd.u32        q1, q0, q8      @ +1
+       vadd.u32        q2, q0, q9      @ +2
+       vadd.u32        q3, q0, q10     @ +3
+       vadd.u32        q4, q1, q10
+       vadd.u32        q5, q2, q10
+       vadd.u32        q6, q3, q10
+       vadd.u32        q7, q4, q10
+       vadd.u32        q10, q5, q10    @ next counter
+
+       @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+       @ to flip byte order in 32-bit counter
+
+       vldmia          sp, {q9}                @ load round0 key
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x10           @ pass next round key
+#else
+       add             r4, r3, #264
+#endif
+       vldmia          r8, {q8}                        @ .LREVM0SR
+       mov             r5, r10                 @ pass rounds
+       vstmia          r9, {q10}                       @ save next counter
+       sub             r6, r8, #.LREVM0SR-.LSR @ pass constants
+
+       bl              _bsaes_encrypt8_alt
+
+       subs            r2, r2, #8
+       blo             .Lctr_enc_loop_done
+
+       vld1.8          {q8-q9}, [r0]!  @ load input
+       vld1.8          {q10-q11}, [r0]!
+       veor            q0, q8
+       veor            q1, q9
+       vld1.8          {q12-q13}, [r0]!
+       veor            q4, q10
+       veor            q6, q11
+       vld1.8          {q14-q15}, [r0]!
+       veor            q3, q12
+       vst1.8          {q0-q1}, [r1]!  @ write output
+       veor            q7, q13
+       veor            q2, q14
+       vst1.8          {q4}, [r1]!
+       veor            q5, q15
+       vst1.8          {q6}, [r1]!
+       vmov.i32        q8, #1                  @ compose 1<<96
+       vst1.8          {q3}, [r1]!
+       veor            q9, q9, q9
+       vst1.8          {q7}, [r1]!
+       vext.8          q8, q9, q8, #4
+       vst1.8          {q2}, [r1]!
+       vadd.u32        q9,q8,q8                @ compose 2<<96
+       vst1.8          {q5}, [r1]!
+       vldmia          r9, {q0}                        @ load counter
+
+       bne             .Lctr_enc_loop
+       b               .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+       add             r2, r2, #8
+       vld1.8          {q8}, [r0]!     @ load input
+       veor            q0, q8
+       vst1.8          {q0}, [r1]!     @ write output
+       cmp             r2, #2
+       blo             .Lctr_enc_done
+       vld1.8          {q9}, [r0]!
+       veor            q1, q9
+       vst1.8          {q1}, [r1]!
+       beq             .Lctr_enc_done
+       vld1.8          {q10}, [r0]!
+       veor            q4, q10
+       vst1.8          {q4}, [r1]!
+       cmp             r2, #4
+       blo             .Lctr_enc_done
+       vld1.8          {q11}, [r0]!
+       veor            q6, q11
+       vst1.8          {q6}, [r1]!
+       beq             .Lctr_enc_done
+       vld1.8          {q12}, [r0]!
+       veor            q3, q12
+       vst1.8          {q3}, [r1]!
+       cmp             r2, #6
+       blo             .Lctr_enc_done
+       vld1.8          {q13}, [r0]!
+       veor            q7, q13
+       vst1.8          {q7}, [r1]!
+       beq             .Lctr_enc_done
+       vld1.8          {q14}, [r0]
+       veor            q2, q14
+       vst1.8          {q2}, [r1]!
+
+.Lctr_enc_done:
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifndef        BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:                       @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r9
+       bne             .Lctr_enc_bzero
+#else
+       vstmia          sp, {q0-q1}
+#endif
+
+       mov     sp, r9
+       add     sp, #0x10               @ add sp,r9,#0x10 is no good for thumb
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}       @ return
+
+.align 4
+.Lctr_enc_short:
+       ldr     ip, [sp]                @ ctr pointer is passed on stack
+       stmdb   sp!, {r4-r8, lr}
+
+       mov     r4, r0          @ copy arguments
+       mov     r5, r1
+       mov     r6, r2
+       mov     r7, r3
+       ldr     r8, [ip, #12]           @ load counter LSW
+       vld1.8  {q1}, [ip]              @ load whole counter value
+#ifdef __ARMEL__
+       rev     r8, r8
+#endif
+       sub     sp, sp, #0x10
+       vst1.8  {q1}, [sp,:64]  @ copy counter value
+       sub     sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+       add     r0, sp, #0x10           @ input counter value
+       mov     r1, sp                  @ output on the stack
+       mov     r2, r7                  @ key
+
+       bl      AES_encrypt
+
+       vld1.8  {q0}, [r4]!     @ load input
+       vld1.8  {q1}, [sp,:64]  @ load encrypted counter
+       add     r8, r8, #1
+#ifdef __ARMEL__
+       rev     r0, r8
+       str     r0, [sp, #0x1c]         @ next counter value
+#else
+       str     r8, [sp, #0x1c]         @ next counter value
+#endif
+       veor    q0,q0,q1
+       vst1.8  {q0}, [r5]!     @ store output
+       subs    r6, r6, #1
+       bne     .Lctr_enc_short_loop
+
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+       vstmia          sp!, {q0-q1}
+
+       ldmia   sp!, {r4-r8, pc}
+.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
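
The "32" in the name is the important detail: as in the 1<<96 counter composition above, only the low 32-bit word of the 16-byte counter block is incremented, as a big-endian value that wraps at 2^32, and each keystream block E(counter) is XORed into the data. A hedged plain-C reference of the same mode (names and the function-pointer type are illustrative):

#include <stddef.h>
#include <stdint.h>

typedef void (*aes_encrypt_block_fn)(const uint8_t in[16], uint8_t out[16],
                                     const void *key);

/* Reference ctr32: big-endian increment of ctr[12..15] only, wrapping at 2^32. */
static void ctr32_encrypt_ref(const uint8_t *in, uint8_t *out, size_t blocks,
                              const void *key, uint8_t ctr[16],
                              aes_encrypt_block_fn encrypt)
{
        uint8_t ks[16];

        for (size_t i = 0; i < blocks; i++) {
                encrypt(ctr, ks, key);
                for (int j = 0; j < 16; j++)
                        out[16 * i + j] = in[16 * i + j] ^ ks[j];

                for (int j = 15; j >= 12; j--)  /* ripple the carry upwards */
                        if (++ctr[j] != 0)
                                break;
        }
}
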
+.globl bsaes_xts_encrypt
+.type  bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future r3
+
+       mov     r7, r0
+       mov     r8, r1
+       mov     r9, r2
+       mov     r10, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0,sp                           @ pointer to initial tweak
+#endif
+
+       ldr     r1, [r10, #240]         @ get # of rounds
+       mov     r3, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r1, lsl#7              @ 128 bytes per inner round key
+       @ add   r12, #96                        @ size of bit-sliced key schedule
+       sub     r12, #48                        @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    q7, q7, q15     @ fix up last round key
+       vstmia  r12, {q7}                       @ save last round key
+#else
+       ldr     r12, [r10, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [r10, #244]
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       add     r12, r10, #248                  @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    q7, q7, q15     @ fix up last round key
+       vstmia  r12, {q7}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+
+       vld1.8  {q8}, [r0]                      @ initial tweak
+       adr     r2, .Lxts_magic
+
+       subs    r9, #0x80
+       blo     .Lxts_enc_short
+       b       .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q6, q8, #63
+       mov             r0, sp
+       vand            q6, q6, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q9, #63
+       veor            q9, q9, q6
+       vand            q7, q7, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q10, #63
+       veor            q10, q10, q7
+       vand            q6, q6, q5
+       vld1.8          {q0}, [r7]!
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q11, #63
+       veor            q11, q11, q6
+       vand            q7, q7, q5
+       vld1.8          {q1}, [r7]!
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q12, #63
+       veor            q12, q12, q7
+       vand            q6, q6, q5
+       vld1.8          {q2}, [r7]!
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q13, #63
+       veor            q13, q13, q6
+       vand            q7, q7, q5
+       vld1.8          {q3}, [r7]!
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q14, #63
+       veor            q14, q14, q7
+       vand            q6, q6, q5
+       vld1.8          {q4}, [r7]!
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q15, #63
+       veor            q15, q15, q6
+       vand            q7, q7, q5
+       vld1.8          {q5}, [r7]!
+       veor            q4, q4, q12
+       vadd.u64        q8, q15, q15
+       vst1.64         {q15}, [r0,:128]!
+       vswp            d15,d14
+       veor            q8, q8, q7
+       vst1.64         {q8}, [r0,:128]         @ next round tweak
+
+       vld1.8          {q6-q7}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       veor            q7, q7, q15
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       vld1.64         {q14-q15}, [r0,:128]!
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q2, q14
+       vst1.8          {q10-q11}, [r8]!
+       veor            q13, q5, q15
+       vst1.8          {q12-q13}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+
+       subs            r9, #0x80
+       bpl             .Lxts_enc_loop
+
+.Lxts_enc_short:
+       adds            r9, #0x70
+       bmi             .Lxts_enc_done
+
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q7, q8, #63
+       mov             r0, sp
+       vand            q7, q7, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q9, #63
+       veor            q9, q9, q7
+       vand            q6, q6, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q10, #63
+       veor            q10, q10, q6
+       vand            q7, q7, q5
+       vld1.8          {q0}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_1
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q11, #63
+       veor            q11, q11, q7
+       vand            q6, q6, q5
+       vld1.8          {q1}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_2
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q12, #63
+       veor            q12, q12, q6
+       vand            q7, q7, q5
+       vld1.8          {q2}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_3
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q13, #63
+       veor            q13, q13, q7
+       vand            q6, q6, q5
+       vld1.8          {q3}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_4
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q14, #63
+       veor            q14, q14, q6
+       vand            q7, q7, q5
+       vld1.8          {q4}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_5
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q15, #63
+       veor            q15, q15, q7
+       vand            q6, q6, q5
+       vld1.8          {q5}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_6
+       veor            q4, q4, q12
+       sub             r9, #0x10
+       vst1.64         {q15}, [r0,:128]                @ next round tweak
+
+       vld1.8          {q6}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       vld1.64         {q14}, [r0,:128]!
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q2, q14
+       vst1.8          {q10-q11}, [r8]!
+       vst1.8          {q12}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+       vst1.64         {q14}, [r0,:128]                @ next round tweak
+
+       veor            q4, q4, q12
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q5, q5, q13
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       vst1.8          {q10-q11}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+       .quad   1, 0x87
+
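For reference, the .Lxts_magic constant {1, 0x87} drives the tweak update used throughout the loops above: each 16-byte tweak is multiplied by x in GF(2^128), with 0x87 folded into the low byte when bit 127 carries out, and the 1 supplying the carry from the low 64-bit lane into the high one. A minimal C sketch of one doubling step, assuming the tweak is held as two little-endian 64-bit halves as the NEON code does (illustrative only, not part of the patch; the function name is mine):

    #include <stdint.h>

    /* multiply the 128-bit XTS tweak by x modulo x^128 + x^7 + x^2 + x + 1 */
    static void xts_double_tweak(uint64_t t[2])     /* t[0] = low half */
    {
            /* arithmetic shift replicates bit 63, like vshr.s64 #63 */
            uint64_t fb = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

            t[1] = (t[1] << 1) | (t[0] >> 63);      /* carry low -> high */
            t[0] = (t[0] << 1) ^ fb;                /* reduce if bit 127 was set */
    }

The repeated vshr.s64/vand/vswp/veor groups in .Lxts_enc_loop and .Lxts_enc_short are the vectorised form of this step, computing the next tweak while the current one is stored away for the round.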
+.align 5
+.Lxts_enc_5:
+       vst1.64         {q13}, [r0,:128]                @ next round tweak
+
+       veor            q3, q3, q11
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q4, q4, q12
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       vst1.8          {q10}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+       vst1.64         {q12}, [r0,:128]                @ next round tweak
+
+       veor            q2, q2, q10
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q3, q3, q11
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       vst1.8          {q8-q9}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+       vst1.64         {q11}, [r0,:128]                @ next round tweak
+
+       veor            q1, q1, q9
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q2, q2, q10
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       vst1.8          {q8}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+       vst1.64         {q10}, [r0,:128]                @ next round tweak
+
+       veor            q0, q0, q8
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q1, q1, q9
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       vst1.8          {q0-q1}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+       mov             r0, sp
+       veor            q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                          @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r8]!
+       mov             r3, r4
+
+       vmov            q8, q9          @ next round tweak
+
+.Lxts_enc_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            r9, #0x10
+       beq             .Lxts_enc_ret
+       sub             r6, r8, #0x10
+
+.Lxts_enc_steal:
+       ldrb            r0, [r7], #1
+       ldrb            r1, [r8, #-0x10]
+       strb            r0, [r8, #-0x10]
+       strb            r1, [r8], #1
+
+       subs            r9, #1
+       bhi             .Lxts_enc_steal
+
+       vld1.8          {q0}, [r6]
+       mov             r0, sp
+       veor            q0, q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                  @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r6]
+       mov             r3, r4
+#endif
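The .Lxts_enc_steal loop above handles XTS ciphertext stealing for a trailing partial block: each plaintext tail byte is swapped into the last full ciphertext block already written, the displaced ciphertext byte becomes the output tail, and the modified block is then encrypted once more with the next tweak. An illustrative C sketch of the byte-swapping part (names are mine, not part of the patch):

    #include <stdint.h>

    /* last_ct: last full ciphertext block already in the output buffer
     * tail_in/tail_out: the partial-block input and output, tail = 1..15 */
    static void xts_steal_swap(uint8_t *last_ct, uint8_t *tail_out,
                               const uint8_t *tail_in, unsigned int tail)
    {
            unsigned int i;

            for (i = 0; i < tail; i++) {
                    uint8_t stolen = last_ct[i];    /* ldrb r1, [r8, #-0x10] */

                    last_ct[i] = tail_in[i];        /* strb r0, [r8, #-0x10] */
                    tail_out[i] = stolen;           /* strb r1, [r8], #1     */
            }
            /* the caller then XORs last_ct with the next tweak, AES-encrypts
             * it and XORs again, as the code above does via r6 */
    }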
+
+.Lxts_enc_ret:
+       bic             r0, r3, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [r3, #0x20+VFP_ABI_FRAME]   @ chain tweak
+#endif
+.Lxts_enc_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_enc_bzero
+
+       mov             sp, r3
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {q8}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type  bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future r3
+
+       mov     r7, r0
+       mov     r8, r1
+       mov     r9, r2
+       mov     r10, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0, sp                          @ pointer to initial tweak
+#endif
+
+       ldr     r1, [r10, #240]         @ get # of rounds
+       mov     r3, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r1, lsl#7              @ 128 bytes per inner round key
+       @ add   r12, #96                        @ size of bit-sliced key schedule
+       sub     r12, #48                        @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, sp, #0x90
+       vldmia  r4, {q6}
+       vstmia  r12,  {q15}             @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  r4, {q7}
+#else
+       ldr     r12, [r10, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [r10, #244]
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       add     r12, r10, #248                  @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, r10, #248
+       vldmia  r4, {q6}
+       vstmia  r12,  {q15}             @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  r4, {q7}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+       vld1.8  {q8}, [r0]                      @ initial tweak
+       adr     r2, .Lxts_magic
+
+       tst     r9, #0xf                        @ if not multiple of 16
+       it      ne                              @ Thumb2 thing, sanity check in ARM
+       subne   r9, #0x10                       @ subtract another 16 bytes
+       subs    r9, #0x80
+
+       blo     .Lxts_dec_short
+       b       .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q6, q8, #63
+       mov             r0, sp
+       vand            q6, q6, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q9, #63
+       veor            q9, q9, q6
+       vand            q7, q7, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q10, #63
+       veor            q10, q10, q7
+       vand            q6, q6, q5
+       vld1.8          {q0}, [r7]!
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q11, #63
+       veor            q11, q11, q6
+       vand            q7, q7, q5
+       vld1.8          {q1}, [r7]!
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q12, #63
+       veor            q12, q12, q7
+       vand            q6, q6, q5
+       vld1.8          {q2}, [r7]!
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q13, #63
+       veor            q13, q13, q6
+       vand            q7, q7, q5
+       vld1.8          {q3}, [r7]!
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q14, #63
+       veor            q14, q14, q7
+       vand            q6, q6, q5
+       vld1.8          {q4}, [r7]!
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q15, #63
+       veor            q15, q15, q6
+       vand            q7, q7, q5
+       vld1.8          {q5}, [r7]!
+       veor            q4, q4, q12
+       vadd.u64        q8, q15, q15
+       vst1.64         {q15}, [r0,:128]!
+       vswp            d15,d14
+       veor            q8, q8, q7
+       vst1.64         {q8}, [r0,:128]         @ next round tweak
+
+       vld1.8          {q6-q7}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       veor            q7, q7, q15
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       vld1.64         {q14-q15}, [r0,:128]!
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q3, q14
+       vst1.8          {q10-q11}, [r8]!
+       veor            q13, q5, q15
+       vst1.8          {q12-q13}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+
+       subs            r9, #0x80
+       bpl             .Lxts_dec_loop
+
+.Lxts_dec_short:
+       adds            r9, #0x70
+       bmi             .Lxts_dec_done
+
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q7, q8, #63
+       mov             r0, sp
+       vand            q7, q7, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q9, #63
+       veor            q9, q9, q7
+       vand            q6, q6, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q10, #63
+       veor            q10, q10, q6
+       vand            q7, q7, q5
+       vld1.8          {q0}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_1
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q11, #63
+       veor            q11, q11, q7
+       vand            q6, q6, q5
+       vld1.8          {q1}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_2
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q12, #63
+       veor            q12, q12, q6
+       vand            q7, q7, q5
+       vld1.8          {q2}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_3
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q13, #63
+       veor            q13, q13, q7
+       vand            q6, q6, q5
+       vld1.8          {q3}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_4
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q14, #63
+       veor            q14, q14, q6
+       vand            q7, q7, q5
+       vld1.8          {q4}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_5
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q15, #63
+       veor            q15, q15, q7
+       vand            q6, q6, q5
+       vld1.8          {q5}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_6
+       veor            q4, q4, q12
+       sub             r9, #0x10
+       vst1.64         {q15}, [r0,:128]                @ next round tweak
+
+       vld1.8          {q6}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       vld1.64         {q14}, [r0,:128]!
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q3, q14
+       vst1.8          {q10-q11}, [r8]!
+       vst1.8          {q12}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+       vst1.64         {q14}, [r0,:128]                @ next round tweak
+
+       veor            q4, q4, q12
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q5, q5, q13
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       vst1.8          {q10-q11}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+       vst1.64         {q13}, [r0,:128]                @ next round tweak
+
+       veor            q3, q3, q11
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q4, q4, q12
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       vst1.8          {q10}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+       vst1.64         {q12}, [r0,:128]                @ next round tweak
+
+       veor            q2, q2, q10
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q3, q3, q11
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       vst1.8          {q8-q9}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+       vst1.64         {q11}, [r0,:128]                @ next round tweak
+
+       veor            q1, q1, q9
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q2, q2, q10
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       vst1.8          {q8}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+       vst1.64         {q10}, [r0,:128]                @ next round tweak
+
+       veor            q0, q0, q8
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q1, q1, q9
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       vst1.8          {q0-q1}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+       mov             r0, sp
+       veor            q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                          @ preserve fp
+       mov             r5, r2                  @ preserve magic
+
+       bl              AES_decrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r8]!
+       mov             r3, r4
+       mov             r2, r5
+
+       vmov            q8, q9          @ next round tweak
+
+.Lxts_dec_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            r9, #0x10
+       beq             .Lxts_dec_ret
+
+       @ calculate one round of extra tweak for the stolen ciphertext
+       vldmia          r2, {q5}
+       vshr.s64        q6, q8, #63
+       vand            q6, q6, q5
+       vadd.u64        q9, q8, q8
+       vswp            d13,d12
+       veor            q9, q9, q6
+
+       @ perform the final decryption with the last tweak value
+       vld1.8          {q0}, [r7]!
+       mov             r0, sp
+       veor            q0, q0, q9
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                  @ preserve fp
+
+       bl              AES_decrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q9
+       vst1.8          {q0}, [r8]
+
+       mov             r6, r8
+.Lxts_dec_steal:
+       ldrb            r1, [r8]
+       ldrb            r0, [r7], #1
+       strb            r1, [r8, #0x10]
+       strb            r0, [r8], #1
+
+       subs            r9, #1
+       bhi             .Lxts_dec_steal
+
+       vld1.8          {q0}, [r6]
+       mov             r0, sp
+       veor            q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+
+       bl              AES_decrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r6]
+       mov             r3, r4
+#endif
+
+.Lxts_dec_ret:
+       bic             r0, r3, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [r3, #0x20+VFP_ABI_FRAME]   @ chain tweak
+#endif
+.Lxts_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_dec_bzero
+
+       mov             sp, r3
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {q8}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
new file mode 100644 (file)
index 0000000..4522366
--- /dev/null
@@ -0,0 +1,434 @@
+/*
+ * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+
+#include "aes_glue.h"
+
+#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
+
+struct BS_KEY {
+       struct AES_KEY  rk;
+       int             converted;
+       u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
+} __aligned(8);
+
+asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
+asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
+
+asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
+                                 struct BS_KEY *key, u8 iv[]);
+
+asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
+                                          struct BS_KEY *key, u8 const iv[]);
+
+asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
+                                 struct BS_KEY *key, u8 tweak[]);
+
+asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
+                                 struct BS_KEY *key, u8 tweak[]);
+
+struct aesbs_cbc_ctx {
+       struct AES_KEY  enc;
+       struct BS_KEY   dec;
+};
+
+struct aesbs_ctr_ctx {
+       struct BS_KEY   enc;
+};
+
+struct aesbs_xts_ctx {
+       struct BS_KEY   enc;
+       struct BS_KEY   dec;
+       struct AES_KEY  twkey;
+};
+
+static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                            unsigned int key_len)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+       int bits = key_len * 8;
+
+       if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       ctx->dec.rk = ctx->enc;
+       private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+       ctx->dec.converted = 0;
+       return 0;
+}
+
+static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                            unsigned int key_len)
+{
+       struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+       int bits = key_len * 8;
+
+       if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       ctx->enc.converted = 0;
+       return 0;
+}
+
+static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                            unsigned int key_len)
+{
+       struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+       int bits = key_len * 4;
+
+       if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       ctx->dec.rk = ctx->enc.rk;
+       private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+       private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
+       ctx->enc.converted = ctx->dec.converted = 0;
+       return 0;
+}
+
+static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt(desc, &walk);
+
+       while (walk.nbytes) {
+               u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+               u8 *src = walk.src.virt.addr;
+
+               if (walk.dst.virt.addr == walk.src.virt.addr) {
+                       u8 *iv = walk.iv;
+
+                       do {
+                               crypto_xor(src, iv, AES_BLOCK_SIZE);
+                               AES_encrypt(src, src, &ctx->enc);
+                               iv = src;
+                               src += AES_BLOCK_SIZE;
+                       } while (--blocks);
+                       memcpy(walk.iv, iv, AES_BLOCK_SIZE);
+               } else {
+                       u8 *dst = walk.dst.virt.addr;
+
+                       do {
+                               crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
+                               AES_encrypt(walk.iv, dst, &ctx->enc);
+                               memcpy(walk.iv, dst, AES_BLOCK_SIZE);
+                               src += AES_BLOCK_SIZE;
+                               dst += AES_BLOCK_SIZE;
+                       } while (--blocks);
+               }
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
+               kernel_neon_begin();
+               bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+                                 walk.nbytes, &ctx->dec, walk.iv);
+               kernel_neon_end();
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       while (walk.nbytes) {
+               u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+               u8 *dst = walk.dst.virt.addr;
+               u8 *src = walk.src.virt.addr;
+               u8 bk[2][AES_BLOCK_SIZE];
+               u8 *iv = walk.iv;
+
+               do {
+                       if (walk.dst.virt.addr == walk.src.virt.addr)
+                               memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
+
+                       AES_decrypt(src, dst, &ctx->dec.rk);
+                       crypto_xor(dst, iv, AES_BLOCK_SIZE);
+
+                       if (walk.dst.virt.addr == walk.src.virt.addr)
+                               iv = bk[blocks & 1];
+                       else
+                               iv = src;
+
+                       dst += AES_BLOCK_SIZE;
+                       src += AES_BLOCK_SIZE;
+               } while (--blocks);
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static void inc_be128_ctr(__be32 ctr[], u32 addend)
+{
+       int i;
+
+       for (i = 3; i >= 0; i--, addend = 1) {
+               u32 n = be32_to_cpu(ctr[i]) + addend;
+
+               ctr[i] = cpu_to_be32(n);
+               if (n >= addend)
+                       break;
+       }
+}
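inc_be128_ctr() adds 'addend' to a 128-bit big-endian counter with carry propagation; the "n >= addend" test detects that the current 32-bit word did not wrap, at which point propagation stops. A quick illustration of the wrap case (hypothetical demo code that would sit in this same file, not part of the patch):

    /* exercise the carry path of inc_be128_ctr() */
    static void inc_be128_ctr_demo(void)
    {
            __be32 ctr[4] = { 0, 0, 0, cpu_to_be32(0xffffffffU) };

            /* the low word wraps to 1; since 1 < 2 the carry continues,
             * and ctr[2] is bumped to 1 with addend reset to 1 */
            inc_be128_ctr(ctr, 2);
    }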
+
+static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst, struct scatterlist *src,
+                            unsigned int nbytes)
+{
+       struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       u32 blocks;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
+               u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+               __be32 *ctr = (__be32 *)walk.iv;
+               u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
+
+		/* avoid 32-bit counter overflow in the NEON code */
+               if (unlikely(headroom < blocks)) {
+                       blocks = headroom + 1;
+                       tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+               }
+               kernel_neon_begin();
+               bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
+                                          walk.dst.virt.addr, blocks,
+                                          &ctx->enc, walk.iv);
+               kernel_neon_end();
+               inc_be128_ctr(ctr, blocks);
+
+               nbytes -= blocks * AES_BLOCK_SIZE;
+               if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
+                       break;
+
+               err = blkcipher_walk_done(desc, &walk, tail);
+       }
+       if (walk.nbytes) {
+               u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+               u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+               u8 ks[AES_BLOCK_SIZE];
+
+               AES_encrypt(walk.iv, ks, &ctx->enc.rk);
+               if (tdst != tsrc)
+                       memcpy(tdst, tsrc, nbytes);
+               crypto_xor(tdst, ks, nbytes);
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
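The headroom cap above exists because bsaes_ctr32_encrypt_blocks() only increments the low 32 bits of the counter inside the NEON code, so each call is limited to what fits before that word wraps; inc_be128_ctr() then carries into the higher words. A small stand-alone illustration of the cap (my own sketch, not kernel code):

    #include <stdint.h>

    /* e.g. ctr_lo == 0xfffffffe gives headroom 1, so at most 2 blocks are
     * handed to the NEON routine before the walk resumes with the carry
     * applied to the full 128-bit counter */
    static uint32_t ctr_blocks_this_call(uint32_t ctr_lo, uint32_t blocks)
    {
            uint32_t headroom = UINT32_MAX - ctr_lo;

            return (headroom < blocks) ? headroom + 1 : blocks;
    }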
+
+static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       /* generate the initial tweak */
+       AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+       while (walk.nbytes) {
+               kernel_neon_begin();
+               bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+                                 walk.nbytes, &ctx->enc, walk.iv);
+               kernel_neon_end();
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       /* generate the initial tweak */
+       AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+       while (walk.nbytes) {
+               kernel_neon_begin();
+               bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
+                                 walk.nbytes, &ctx->dec, walk.iv);
+               kernel_neon_end();
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static struct crypto_alg aesbs_algs[] = { {
+       .cra_name               = "__cbc-aes-neonbs",
+       .cra_driver_name        = "__driver-cbc-aes-neonbs",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct aesbs_cbc_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_blkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = aesbs_cbc_set_key,
+               .encrypt        = aesbs_cbc_encrypt,
+               .decrypt        = aesbs_cbc_decrypt,
+       },
+}, {
+       .cra_name               = "__ctr-aes-neonbs",
+       .cra_driver_name        = "__driver-ctr-aes-neonbs",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          = 1,
+       .cra_ctxsize            = sizeof(struct aesbs_ctr_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_blkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = aesbs_ctr_set_key,
+               .encrypt        = aesbs_ctr_encrypt,
+               .decrypt        = aesbs_ctr_encrypt,
+       },
+}, {
+       .cra_name               = "__xts-aes-neonbs",
+       .cra_driver_name        = "__driver-xts-aes-neonbs",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct aesbs_xts_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_blkcipher = {
+               .min_keysize    = 2 * AES_MIN_KEY_SIZE,
+               .max_keysize    = 2 * AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = aesbs_xts_set_key,
+               .encrypt        = aesbs_xts_encrypt,
+               .decrypt        = aesbs_xts_decrypt,
+       },
+}, {
+       .cra_name               = "cbc(aes)",
+       .cra_driver_name        = "cbc-aes-neonbs",
+       .cra_priority           = 300,
+       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct async_helper_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_ablkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_init               = ablk_init,
+       .cra_exit               = ablk_exit,
+       .cra_ablkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = ablk_set_key,
+               .encrypt        = __ablk_encrypt,
+               .decrypt        = ablk_decrypt,
+       }
+}, {
+       .cra_name               = "ctr(aes)",
+       .cra_driver_name        = "ctr-aes-neonbs",
+       .cra_priority           = 300,
+       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+       .cra_blocksize          = 1,
+       .cra_ctxsize            = sizeof(struct async_helper_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_ablkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_init               = ablk_init,
+       .cra_exit               = ablk_exit,
+       .cra_ablkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = ablk_set_key,
+               .encrypt        = ablk_encrypt,
+               .decrypt        = ablk_decrypt,
+       }
+}, {
+       .cra_name               = "xts(aes)",
+       .cra_driver_name        = "xts-aes-neonbs",
+       .cra_priority           = 300,
+       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct async_helper_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_ablkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_init               = ablk_init,
+       .cra_exit               = ablk_exit,
+       .cra_ablkcipher = {
+               .min_keysize    = 2 * AES_MIN_KEY_SIZE,
+               .max_keysize    = 2 * AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = ablk_set_key,
+               .encrypt        = ablk_encrypt,
+               .decrypt        = ablk_decrypt,
+       }
+} };
+
+static int __init aesbs_mod_init(void)
+{
+       if (!cpu_has_neon())
+               return -ENODEV;
+
+       return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+static void __exit aesbs_mod_exit(void)
+{
+       crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+module_init(aesbs_mod_init);
+module_exit(aesbs_mod_exit);
+
+MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
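The registration table pairs each "__"-prefixed synchronous NEON-only algorithm with an asynchronous cbc(aes)/ctr(aes)/xts(aes) wrapper built on the ablk helper, so the NEON path can be reached from any context. As a rough usage sketch for kernels of this era (hypothetical caller, not part of the patch), a user simply requests the template by name and priority selection does the rest:

    #include <linux/crypto.h>

    static struct crypto_ablkcipher *get_neon_xts_aes(void)
    {
            /* "xts(aes)" resolves to the highest-priority implementation;
             * with this module loaded and NEON present, cra_priority 300
             * selects "xts-aes-neonbs" over the generic fallback */
            return crypto_alloc_ablkcipher("xts(aes)", 0, 0);
    }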
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644 (file)
index 0000000..f3d96d9
--- /dev/null
@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is a direct adaptation of the bsaes-x86_64 module
+# for ARM NEON, except that this module is endian-neutral [in the sense
+# that it can be compiled for either endianness] courtesy of vld1.8's
+# neutrality. The initial version doesn't implement an interface to
+# OpenSSL, only low-level primitives and unsupported entry points, just
+# enough to collect performance results, which for the Cortex-A8 core are:
+#
+# encrypt      19.5 cycles per byte processed with 128-bit key
+# decrypt      22.1 cycles per byte processed with 128-bit key
+# key conv.    440  cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results, keep in mind that the NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results, keep in mind the key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+#                                              <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+#                                      <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+       &InBasisChange  (@b);
+       &Inv_GF256      (@b[6,5,0,3,7,1,4,2],@t,@s);
+       &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+       veor    @b[2], @b[2], @b[1]
+       veor    @b[5], @b[5], @b[6]
+       veor    @b[3], @b[3], @b[0]
+       veor    @b[6], @b[6], @b[2]
+       veor    @b[5], @b[5], @b[0]
+
+       veor    @b[6], @b[6], @b[3]
+       veor    @b[3], @b[3], @b[7]
+       veor    @b[7], @b[7], @b[5]
+       veor    @b[3], @b[3], @b[4]
+       veor    @b[4], @b[4], @b[5]
+
+       veor    @b[2], @b[2], @b[7]
+       veor    @b[3], @b[3], @b[1]
+       veor    @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+       veor    @b[0], @b[0], @b[6]
+       veor    @b[1], @b[1], @b[4]
+       veor    @b[4], @b[4], @b[6]
+       veor    @b[2], @b[2], @b[0]
+       veor    @b[6], @b[6], @b[1]
+
+       veor    @b[1], @b[1], @b[5]
+       veor    @b[5], @b[5], @b[3]
+       veor    @b[3], @b[3], @b[7]
+       veor    @b[7], @b[7], @b[5]
+       veor    @b[2], @b[2], @b[5]
+
+       veor    @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb         > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb        > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+       &InvInBasisChange       (@b);
+       &Inv_GF256              (@b[5,1,2,6,3,7,0,4],@t,@s);
+       &InvOutBasisChange      (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {         # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+        veor   @b[1], @b[1], @b[7]
+       veor    @b[4], @b[4], @b[7]
+
+       veor    @b[7], @b[7], @b[5]
+        veor   @b[1], @b[1], @b[3]
+       veor    @b[2], @b[2], @b[5]
+       veor    @b[3], @b[3], @b[7]
+
+       veor    @b[6], @b[6], @b[1]
+       veor    @b[2], @b[2], @b[0]
+        veor   @b[5], @b[5], @b[3]
+       veor    @b[4], @b[4], @b[6]
+       veor    @b[0], @b[0], @b[6]
+       veor    @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange {                # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+       veor    @b[1], @b[1], @b[5]
+       veor    @b[2], @b[2], @b[7]
+
+       veor    @b[3], @b[3], @b[1]
+       veor    @b[4], @b[4], @b[5]
+       veor    @b[7], @b[7], @b[5]
+       veor    @b[3], @b[3], @b[4]
+        veor   @b[5], @b[5], @b[0]
+       veor    @b[3], @b[3], @b[7]
+        veor   @b[6], @b[6], @b[2]
+        veor   @b[2], @b[2], @b[1]
+       veor    @b[6], @b[6], @b[3]
+
+       veor    @b[3], @b[3], @b[0]
+       veor    @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+       veor    $t0, $y0, $y1
+       vand    $t0, $t0, $x0
+       veor    $x0, $x0, $x1
+       vand    $t1, $x1, $y0
+       vand    $x0, $x0, $y1
+       veor    $x1, $t1, $t0
+       veor    $x0, $x0, $t1
+___
+}
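Mul_GF4 multiplies two bit-sliced GF(2^2) elements, each held one bit per register, using three ANDs and four XORs. A scalar C transliteration of the same data flow, with each uint64_t standing in for one NEON register of bit-sliced data (names are mine, illustrative only):

    #include <stdint.h>

    /* (x1:x0) *= (y1:y0) in bit-sliced form */
    static inline void mul_gf4(uint64_t *x0, uint64_t *x1,
                               uint64_t y0, uint64_t y1)
    {
            uint64_t t0 = (y0 ^ y1) & *x0;  /* veor t0,y0,y1; vand t0,t0,x0 */
            uint64_t t1 = *x1 & y0;         /* vand t1,x1,y0                */
            uint64_t a  = (*x0 ^ *x1) & y1; /* veor x0,x0,x1; vand x0,x0,y1 */

            *x1 = t1 ^ t0;                  /* veor x1,t1,t0 */
            *x0 = a  ^ t1;                  /* veor x0,x0,t1 */
    }

Mul_GF4_N and Mul_GF4_N_GF4 below follow the same pattern, with the scaling by N folded into the XOR ordering.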
+
+sub Mul_GF4_N {                                # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+       veor    $t0, $y0, $y1
+       vand    $t0, $t0, $x0
+       veor    $x0, $x0, $x1
+       vand    $x1, $x1, $y0
+       vand    $x0, $x0, $y1
+       veor    $x1, $x1, $x0
+       veor    $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+       veor    $t0, $y0, $y1
+        veor   $t1, $y2, $y3
+       vand    $t0, $t0, $x0
+        vand   $t1, $t1, $x2
+       veor    $x0, $x0, $x1
+        veor   $x2, $x2, $x3
+       vand    $x1, $x1, $y0
+        vand   $x3, $x3, $y2
+       vand    $x0, $x0, $y1
+        vand   $x2, $x2, $y3
+       veor    $x1, $x1, $x0
+        veor   $x2, $x2, $x3
+       veor    $x0, $x0, $t0
+        veor   $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+       veor    @t[0], @x[0], @x[2]
+       veor    @t[1], @x[1], @x[3]
+___
+       &Mul_GF4        (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+       veor    @y[0], @y[0], @y[2]
+       veor    @y[1], @y[1], @y[3]
+___
+       Mul_GF4_N_GF4   (@t[0], @t[1], @y[0], @y[1], @t[3],
+                        @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+       veor    @x[0], @x[0], @t[0]
+       veor    @x[2], @x[2], @t[0]
+       veor    @x[1], @x[1], @t[1]
+       veor    @x[3], @x[3], @t[1]
+
+       veor    @t[0], @x[4], @x[6]
+       veor    @t[1], @x[5], @x[7]
+___
+       &Mul_GF4_N_GF4  (@t[0], @t[1], @y[0], @y[1], @t[3],
+                        @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+       veor    @y[0], @y[0], @y[2]
+       veor    @y[1], @y[1], @y[3]
+___
+       &Mul_GF4        (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+       veor    @x[4], @x[4], @t[0]
+       veor    @x[6], @x[6], @t[0]
+       veor    @x[5], @x[5], @t[1]
+       veor    @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+       veor    @t[3], @x[4], @x[6]
+       veor    @t[2], @x[5], @x[7]
+       veor    @t[1], @x[1], @x[3]
+       veor    @s[1], @x[7], @x[6]
+        vmov   @t[0], @t[2]
+       veor    @s[0], @x[0], @x[2]
+
+       vorr    @t[2], @t[2], @t[1]
+       veor    @s[3], @t[3], @t[0]
+       vand    @s[2], @t[3], @s[0]
+       vorr    @t[3], @t[3], @s[0]
+       veor    @s[0], @s[0], @t[1]
+       vand    @t[0], @t[0], @t[1]
+       veor    @t[1], @x[3], @x[2]
+       vand    @s[3], @s[3], @s[0]
+       vand    @s[1], @s[1], @t[1]
+       veor    @t[1], @x[4], @x[5]
+       veor    @s[0], @x[1], @x[0]
+       veor    @t[3], @t[3], @s[1]
+       veor    @t[2], @t[2], @s[1]
+       vand    @s[1], @t[1], @s[0]
+       vorr    @t[1], @t[1], @s[0]
+       veor    @t[3], @t[3], @s[3]
+       veor    @t[0], @t[0], @s[1]
+       veor    @t[2], @t[2], @s[2]
+       veor    @t[1], @t[1], @s[3]
+       veor    @t[0], @t[0], @s[2]
+       vand    @s[0], @x[7], @x[3]
+       veor    @t[1], @t[1], @s[2]
+       vand    @s[1], @x[6], @x[2]
+       vand    @s[2], @x[5], @x[1]
+       vorr    @s[3], @x[4], @x[0]
+       veor    @t[3], @t[3], @s[0]
+       veor    @t[1], @t[1], @s[2]
+       veor    @t[0], @t[0], @s[3]
+       veor    @t[2], @t[2], @s[1]
+
+       @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+       @ new smaller inversion
+
+       vand    @s[2], @t[3], @t[1]
+       vmov    @s[0], @t[0]
+
+       veor    @s[1], @t[2], @s[2]
+       veor    @s[3], @t[0], @s[2]
+       veor    @s[2], @t[0], @s[2]     @ @s[2]=@s[3]
+
+       vbsl    @s[1], @t[1], @t[0]
+       vbsl    @s[3], @t[3], @t[2]
+       veor    @t[3], @t[3], @t[2]
+
+       vbsl    @s[0], @s[1], @s[2]
+       vbsl    @t[0], @s[2], @s[1]
+
+       vand    @s[2], @s[0], @s[3]
+       veor    @t[1], @t[1], @t[0]
+
+       veor    @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+       &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+       vldmia  $key!, {@t[0]-@t[3]}
+       veor    @t[0], @t[0], @x[0]
+       veor    @t[1], @t[1], @x[1]
+       vtbl.8  `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[0]}
+       veor    @t[2], @t[2], @x[2]
+       vtbl.8  `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[1]}
+       veor    @t[3], @t[3], @x[3]
+       vtbl.8  `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[2]}
+       vtbl.8  `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[3]}
+       veor    @t[0], @t[0], @x[4]
+       veor    @t[1], @t[1], @x[5]
+       vtbl.8  `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+       veor    @t[2], @t[2], @x[6]
+       vtbl.8  `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+       veor    @t[3], @t[3], @x[7]
+       vtbl.8  `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+       vtbl.8  `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16];        # optional
+$code.=<<___;
+       vext.8  @t[0], @x[0], @x[0], #12        @ x0 <<< 32
+       vext.8  @t[1], @x[1], @x[1], #12
+        veor   @x[0], @x[0], @t[0]             @ x0 ^ (x0 <<< 32)
+       vext.8  @t[2], @x[2], @x[2], #12
+        veor   @x[1], @x[1], @t[1]
+       vext.8  @t[3], @x[3], @x[3], #12
+        veor   @x[2], @x[2], @t[2]
+       vext.8  @t[4], @x[4], @x[4], #12
+        veor   @x[3], @x[3], @t[3]
+       vext.8  @t[5], @x[5], @x[5], #12
+        veor   @x[4], @x[4], @t[4]
+       vext.8  @t[6], @x[6], @x[6], #12
+        veor   @x[5], @x[5], @t[5]
+       vext.8  @t[7], @x[7], @x[7], #12
+        veor   @x[6], @x[6], @t[6]
+
+       veor    @t[1], @t[1], @x[0]
+        veor   @x[7], @x[7], @t[7]
+        vext.8 @x[0], @x[0], @x[0], #8         @ (x0 ^ (x0 <<< 32)) <<< 64
+       veor    @t[2], @t[2], @x[1]
+       veor    @t[0], @t[0], @x[7]
+       veor    @t[1], @t[1], @x[7]
+        vext.8 @x[1], @x[1], @x[1], #8
+       veor    @t[5], @t[5], @x[4]
+        veor   @x[0], @x[0], @t[0]
+       veor    @t[6], @t[6], @x[5]
+        veor   @x[1], @x[1], @t[1]
+        vext.8 @t[0], @x[4], @x[4], #8
+       veor    @t[4], @t[4], @x[3]
+        vext.8 @t[1], @x[5], @x[5], #8
+       veor    @t[7], @t[7], @x[6]
+        vext.8 @x[4], @x[3], @x[3], #8
+       veor    @t[3], @t[3], @x[2]
+        vext.8 @x[5], @x[7], @x[7], #8
+       veor    @t[4], @t[4], @x[7]
+        vext.8 @x[3], @x[6], @x[6], #8
+       veor    @t[3], @t[3], @x[7]
+        vext.8 @x[6], @x[2], @x[2], #8
+       veor    @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+       veor    @x[2], @t[0], @t[4]
+       veor    @x[4], @x[4], @t[3]
+       veor    @x[5], @x[5], @t[7]
+       veor    @x[3], @x[3], @t[6]
+        @ vmov @x[2], @t[0]
+       veor    @x[6], @x[6], @t[2]
+        @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+       veor    @t[3], @t[3], @x[4]
+       veor    @x[5], @x[5], @t[7]
+       veor    @x[2], @x[3], @t[6]
+       veor    @x[3], @t[0], @t[4]
+       veor    @x[4], @x[6], @t[2]
+       vmov    @x[6], @t[3]
+        @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+       @ multiplication by 0x0e
+       vext.8  @t[7], @x[7], @x[7], #12
+       vmov    @t[2], @x[2]
+       veor    @x[2], @x[2], @x[5]             @ 2 5
+       veor    @x[7], @x[7], @x[5]             @ 7 5
+       vext.8  @t[0], @x[0], @x[0], #12
+       vmov    @t[5], @x[5]
+       veor    @x[5], @x[5], @x[0]             @ 5 0           [1]
+       veor    @x[0], @x[0], @x[1]             @ 0 1
+       vext.8  @t[1], @x[1], @x[1], #12
+       veor    @x[1], @x[1], @x[2]             @ 1 25
+       veor    @x[0], @x[0], @x[6]             @ 01 6          [2]
+       vext.8  @t[3], @x[3], @x[3], #12
+       veor    @x[1], @x[1], @x[3]             @ 125 3         [4]
+       veor    @x[2], @x[2], @x[0]             @ 25 016        [3]
+       veor    @x[3], @x[3], @x[7]             @ 3 75
+       veor    @x[7], @x[7], @x[6]             @ 75 6          [0]
+       vext.8  @t[6], @x[6], @x[6], #12
+       vmov    @t[4], @x[4]
+       veor    @x[6], @x[6], @x[4]             @ 6 4
+       veor    @x[4], @x[4], @x[3]             @ 4 375         [6]
+       veor    @x[3], @x[3], @x[7]             @ 375 756=36
+       veor    @x[6], @x[6], @t[5]             @ 64 5          [7]
+       veor    @x[3], @x[3], @t[2]             @ 36 2
+       vext.8  @t[5], @t[5], @t[5], #12
+       veor    @x[3], @x[3], @t[4]             @ 362 4         [5]
+___
+                                       my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+       @ multiplication by 0x0b
+       veor    @y[1], @y[1], @y[0]
+       veor    @y[0], @y[0], @t[0]
+       vext.8  @t[2], @t[2], @t[2], #12
+       veor    @y[1], @y[1], @t[1]
+       veor    @y[0], @y[0], @t[5]
+       vext.8  @t[4], @t[4], @t[4], #12
+       veor    @y[1], @y[1], @t[6]
+       veor    @y[0], @y[0], @t[7]
+       veor    @t[7], @t[7], @t[6]             @ clobber t[7]
+
+       veor    @y[3], @y[3], @t[0]
+        veor   @y[1], @y[1], @y[0]
+       vext.8  @t[0], @t[0], @t[0], #12
+       veor    @y[2], @y[2], @t[1]
+       veor    @y[4], @y[4], @t[1]
+       vext.8  @t[1], @t[1], @t[1], #12
+       veor    @y[2], @y[2], @t[2]
+       veor    @y[3], @y[3], @t[2]
+       veor    @y[5], @y[5], @t[2]
+       veor    @y[2], @y[2], @t[7]
+       vext.8  @t[2], @t[2], @t[2], #12
+       veor    @y[3], @y[3], @t[3]
+       veor    @y[6], @y[6], @t[3]
+       veor    @y[4], @y[4], @t[3]
+       veor    @y[7], @y[7], @t[4]
+       vext.8  @t[3], @t[3], @t[3], #12
+       veor    @y[5], @y[5], @t[4]
+       veor    @y[7], @y[7], @t[7]
+       veor    @t[7], @t[7], @t[5]             @ clobber t[7] even more
+       veor    @y[3], @y[3], @t[5]
+       veor    @y[4], @y[4], @t[4]
+
+       veor    @y[5], @y[5], @t[7]
+       vext.8  @t[4], @t[4], @t[4], #12
+       veor    @y[6], @y[6], @t[7]
+       veor    @y[4], @y[4], @t[7]
+
+       veor    @t[7], @t[7], @t[5]
+       vext.8  @t[5], @t[5], @t[5], #12
+
+       @ multiplication by 0x0d
+       veor    @y[4], @y[4], @y[7]
+        veor   @t[7], @t[7], @t[6]             @ restore t[7]
+       veor    @y[7], @y[7], @t[4]
+       vext.8  @t[6], @t[6], @t[6], #12
+       veor    @y[2], @y[2], @t[0]
+       veor    @y[7], @y[7], @t[5]
+       vext.8  @t[7], @t[7], @t[7], #12
+       veor    @y[2], @y[2], @t[2]
+
+       veor    @y[3], @y[3], @y[1]
+       veor    @y[1], @y[1], @t[1]
+       veor    @y[0], @y[0], @t[0]
+       veor    @y[3], @y[3], @t[0]
+       veor    @y[1], @y[1], @t[5]
+       veor    @y[0], @y[0], @t[5]
+       vext.8  @t[0], @t[0], @t[0], #12
+       veor    @y[1], @y[1], @t[7]
+       veor    @y[0], @y[0], @t[6]
+       veor    @y[3], @y[3], @y[1]
+       veor    @y[4], @y[4], @t[1]
+       vext.8  @t[1], @t[1], @t[1], #12
+
+       veor    @y[7], @y[7], @t[7]
+       veor    @y[4], @y[4], @t[2]
+       veor    @y[5], @y[5], @t[2]
+       veor    @y[2], @y[2], @t[6]
+       veor    @t[6], @t[6], @t[3]             @ clobber t[6]
+       vext.8  @t[2], @t[2], @t[2], #12
+       veor    @y[4], @y[4], @y[7]
+       veor    @y[3], @y[3], @t[6]
+
+       veor    @y[6], @y[6], @t[6]
+       veor    @y[5], @y[5], @t[5]
+       vext.8  @t[5], @t[5], @t[5], #12
+       veor    @y[6], @y[6], @t[4]
+       vext.8  @t[4], @t[4], @t[4], #12
+       veor    @y[5], @y[5], @t[6]
+       veor    @y[6], @y[6], @t[7]
+       vext.8  @t[7], @t[7], @t[7], #12
+       veor    @t[6], @t[6], @t[3]             @ restore t[6]
+       vext.8  @t[3], @t[3], @t[3], #12
+
+       @ multiplication by 0x09
+       veor    @y[4], @y[4], @y[1]
+       veor    @t[1], @t[1], @y[1]             @ t[1]=y[1]
+       veor    @t[0], @t[0], @t[5]             @ clobber t[0]
+       vext.8  @t[6], @t[6], @t[6], #12
+       veor    @t[1], @t[1], @t[5]
+       veor    @y[3], @y[3], @t[0]
+       veor    @t[0], @t[0], @y[0]             @ t[0]=y[0]
+       veor    @t[1], @t[1], @t[6]
+       veor    @t[6], @t[6], @t[7]             @ clobber t[6]
+       veor    @y[4], @y[4], @t[1]
+       veor    @y[7], @y[7], @t[4]
+       veor    @y[6], @y[6], @t[3]
+       veor    @y[5], @y[5], @t[2]
+       veor    @t[4], @t[4], @y[4]             @ t[4]=y[4]
+       veor    @t[3], @t[3], @y[3]             @ t[3]=y[3]
+       veor    @t[5], @t[5], @y[5]             @ t[5]=y[5]
+       veor    @t[2], @t[2], @y[2]             @ t[2]=y[2]
+       veor    @t[3], @t[3], @t[7]
+       veor    @XMM[5], @t[5], @t[6]
+       veor    @XMM[6], @t[6], @y[6]           @ t[6]=y[6]
+       veor    @XMM[2], @t[2], @t[6]
+       veor    @XMM[7], @t[7], @y[7]           @ t[7]=y[7]
+
+       vmov    @XMM[0], @t[0]
+       vmov    @XMM[1], @t[1]
+       @ vmov  @XMM[2], @t[2]
+       vmov    @XMM[3], @t[3]
+       vmov    @XMM[4], @t[4]
+       @ vmov  @XMM[5], @t[5]
+       @ vmov  @XMM[6], @t[6]
+       @ vmov  @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
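+#
+# Illustrative sketch only (never called by this generator; assumes the AES
+# reduction polynomial 0x11b): one way to spot-check an entry of the
+# factorisation above, e.g. the top-left one,
+# 0e = 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0a ^ 04 in GF(2^8).
+sub gf256_mul_demo {
+my ($x,$y)=@_;
+my $r=0;
+for (1..8) {
+	$r ^= $x if ($y & 1);
+	$y >>= 1;
+	$x <<= 1;
+	$x ^= 0x11b if ($x & 0x100);
+}
+return $r;	# gf256_mul_demo(0x02,0x05)^gf256_mul_demo(0x01,0x04) == 0x0e
+}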
+
+$code.=<<___;
+       @ multiplication by 0x05-0x00-0x04-0x00
+       vext.8  @t[0], @x[0], @x[0], #8
+       vext.8  @t[6], @x[6], @x[6], #8
+       vext.8  @t[7], @x[7], @x[7], #8
+       veor    @t[0], @t[0], @x[0]
+       vext.8  @t[1], @x[1], @x[1], #8
+       veor    @t[6], @t[6], @x[6]
+       vext.8  @t[2], @x[2], @x[2], #8
+       veor    @t[7], @t[7], @x[7]
+       vext.8  @t[3], @x[3], @x[3], #8
+       veor    @t[1], @t[1], @x[1]
+       vext.8  @t[4], @x[4], @x[4], #8
+       veor    @t[2], @t[2], @x[2]
+       vext.8  @t[5], @x[5], @x[5], #8
+       veor    @t[3], @t[3], @x[3]
+       veor    @t[4], @t[4], @x[4]
+       veor    @t[5], @t[5], @x[5]
+
+        veor   @x[0], @x[0], @t[6]
+        veor   @x[1], @x[1], @t[6]
+        veor   @x[2], @x[2], @t[0]
+        veor   @x[4], @x[4], @t[2]
+        veor   @x[3], @x[3], @t[1]
+        veor   @x[1], @x[1], @t[7]
+        veor   @x[2], @x[2], @t[7]
+        veor   @x[4], @x[4], @t[6]
+        veor   @x[5], @x[5], @t[3]
+        veor   @x[3], @x[3], @t[6]
+        veor   @x[6], @x[6], @t[4]
+        veor   @x[4], @x[4], @t[7]
+        veor   @x[5], @x[5], @t[7]
+        veor   @x[7], @x[7], @t[5]
+___
+       &MixColumns     (@x,@t,1);      # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+       vshr.u64        $t, $b, #$n
+       veor            $t, $t, $a
+       vand            $t, $t, $mask
+       veor            $a, $a, $t
+       vshl.u64        $t, $t, #$n
+       veor            $b, $b, $t
+___
+}
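+
+# Scalar restatement of the sequence above (a sketch for illustration only,
+# never called by the generator): swapmove exchanges, between two words, the
+# bit groups selected by the mask at distance n; bitslice() below chains
+# such swaps to transpose the state into bit-sliced form.
+sub swapmove_scalar_demo {
+my ($a,$b,$n,$mask)=@_;
+my $t=(($b>>$n)^$a)&$mask;		# vshr.u64 + veor + vand
+return ($a^$t, $b^($t<<$n));		# veor / vshl.u64 + veor
+}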
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+       vshr.u64        $t0, $b0, #$n
+        vshr.u64       $t1, $b1, #$n
+       veor            $t0, $t0, $a0
+        veor           $t1, $t1, $a1
+       vand            $t0, $t0, $mask
+        vand           $t1, $t1, $mask
+       veor            $a0, $a0, $t0
+       vshl.u64        $t0, $t0, #$n
+        veor           $a1, $a1, $t1
+        vshl.u64       $t1, $t1, #$n
+       veor            $b0, $b0, $t0
+        veor           $b1, $b1, $t1
+___
+}
+
+sub bitslice {
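+# The swapmove2x passes below (masks 0x55/0x33/0x0f, shifts 1/2/4) amount to
+# an 8x8 bit-matrix transpose across the eight input registers, so that each
+# output register collects one bit position from every byte of the state.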
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+       vmov.i8 $t0,#0x55                       @ compose .LBS0
+       vmov.i8 $t1,#0x33                       @ compose .LBS1
+___
+       &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+       &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+       vmov.i8 $t0,#0x0f                       @ compose .LBS2
+___
+       &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+       &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+       &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+       &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
+# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax        unified         @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu   neon
+
+.type  _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+       adr     $const,_bsaes_decrypt8
+       vldmia  $key!, {@XMM[9]}                @ round 0 key
+       add     $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+       vldmia  $const!, {@XMM[8]}              @ .LM0ISR
+       veor    @XMM[10], @XMM[0], @XMM[9]      @ xor with round0 key
+       veor    @XMM[11], @XMM[1], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+       veor    @XMM[12], @XMM[2], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+       veor    @XMM[13], @XMM[3], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+       veor    @XMM[14], @XMM[4], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+       veor    @XMM[15], @XMM[5], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+       veor    @XMM[10], @XMM[6], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+       veor    @XMM[11], @XMM[7], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+        vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+       &bitslice       (@XMM[0..7, 8..11]);
+$code.=<<___;
+       sub     $rounds,$rounds,#1
+       b       .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+       &ShiftRows      (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+       &InvSbox        (@XMM[0..7, 8..15]);
+$code.=<<___;
+       subs    $rounds,$rounds,#1
+       bcc     .Ldec_done
+___
+       &InvMixColumns  (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+       vldmia  $const, {@XMM[12]}              @ .LISR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   $const,$const,#0x10
+       bne     .Ldec_loop
+       vldmia  $const, {@XMM[12]}              @ .LISRM0
+       b       .Ldec_loop
+.align 4
+.Ldec_done:
+___
+       &bitslice       (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+       vldmia  $key, {@XMM[8]}                 @ last round key
+       veor    @XMM[6], @XMM[6], @XMM[8]
+       veor    @XMM[4], @XMM[4], @XMM[8]
+       veor    @XMM[2], @XMM[2], @XMM[8]
+       veor    @XMM[7], @XMM[7], @XMM[8]
+       veor    @XMM[3], @XMM[3], @XMM[8]
+       veor    @XMM[5], @XMM[5], @XMM[8]
+       veor    @XMM[0], @XMM[0], @XMM[8]
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       bx      lr
+.size  _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type  _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR:       @ InvShiftRows constants
+       .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+       .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+       .quad   0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:                @ ShiftRows constants
+       .quad   0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+       .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+       .quad   0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+       .quad   0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+       .quad   0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size  _bsaes_const,.-_bsaes_const
+
+.type  _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+       adr     $const,_bsaes_encrypt8
+       vldmia  $key!, {@XMM[9]}                @ round 0 key
+       sub     $const,$const,#_bsaes_encrypt8-.LM0SR
+
+       vldmia  $const!, {@XMM[8]}              @ .LM0SR
+_bsaes_encrypt8_alt:
+       veor    @XMM[10], @XMM[0], @XMM[9]      @ xor with round0 key
+       veor    @XMM[11], @XMM[1], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+       veor    @XMM[12], @XMM[2], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+       veor    @XMM[13], @XMM[3], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+       veor    @XMM[14], @XMM[4], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+       veor    @XMM[15], @XMM[5], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+       veor    @XMM[10], @XMM[6], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+       veor    @XMM[11], @XMM[7], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+        vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+       &bitslice       (@XMM[0..7, 8..11]);
+$code.=<<___;
+       sub     $rounds,$rounds,#1
+       b       .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+       &ShiftRows      (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+       &Sbox           (@XMM[0..7, 8..15]);
+$code.=<<___;
+       subs    $rounds,$rounds,#1
+       bcc     .Lenc_done
+___
+       &MixColumns     (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+       vldmia  $const, {@XMM[12]}              @ .LSR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   $const,$const,#0x10
+       bne     .Lenc_loop
+       vldmia  $const, {@XMM[12]}              @ .LSRM0
+       b       .Lenc_loop
+.align 4
+.Lenc_done:
+___
+       # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+       &bitslice       (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+       vldmia  $key, {@XMM[8]}                 @ last round key
+       veor    @XMM[4], @XMM[4], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[8]
+       veor    @XMM[3], @XMM[3], @XMM[8]
+       veor    @XMM[7], @XMM[7], @XMM[8]
+       veor    @XMM[2], @XMM[2], @XMM[8]
+       veor    @XMM[5], @XMM[5], @XMM[8]
+       veor    @XMM[0], @XMM[0], @XMM[8]
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       bx      lr
+.size  _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+       &swapmove       (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+       @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+       vmov    @x[2], @x[0]
+       vmov    @x[3], @x[1]
+___
+       #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+       &swapmove2x     (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+       @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+       vmov    @x[4], @x[0]
+       vmov    @x[6], @x[2]
+       vmov    @x[5], @x[1]
+       vmov    @x[7], @x[3]
+___
+       &swapmove2x     (@x[0,4,1,5],4,$bs2,$t2,$t3);
+       &swapmove2x     (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type  _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+       adr     $const,_bsaes_key_convert
+       vld1.8  {@XMM[7]},  [$inp]!             @ load round 0 key
+       sub     $const,$const,#_bsaes_key_convert-.LM0
+       vld1.8  {@XMM[15]}, [$inp]!             @ load round 1 key
+
+       vmov.i8 @XMM[8],  #0x01                 @ bit masks
+       vmov.i8 @XMM[9],  #0x02
+       vmov.i8 @XMM[10], #0x04
+       vmov.i8 @XMM[11], #0x08
+       vmov.i8 @XMM[12], #0x10
+       vmov.i8 @XMM[13], #0x20
+       vldmia  $const, {@XMM[14]}              @ .LM0
+
+#ifdef __ARMEL__
+       vrev32.8        @XMM[7],  @XMM[7]
+       vrev32.8        @XMM[15], @XMM[15]
+#endif
+       sub     $rounds,$rounds,#1
+       vstmia  $out!, {@XMM[7]}                @ save round 0 key
+       b       .Lkey_loop
+
+.align 4
+.Lkey_loop:
+       vtbl.8  `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+       vtbl.8  `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+       vmov.i8 @XMM[6],  #0x40
+       vmov.i8 @XMM[15], #0x80
+
+       vtst.8  @XMM[0], @XMM[7], @XMM[8]
+       vtst.8  @XMM[1], @XMM[7], @XMM[9]
+       vtst.8  @XMM[2], @XMM[7], @XMM[10]
+       vtst.8  @XMM[3], @XMM[7], @XMM[11]
+       vtst.8  @XMM[4], @XMM[7], @XMM[12]
+       vtst.8  @XMM[5], @XMM[7], @XMM[13]
+       vtst.8  @XMM[6], @XMM[7], @XMM[6]
+       vtst.8  @XMM[7], @XMM[7], @XMM[15]
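+       @ each vtst.8 above expands one bit of every key byte into a full
+       @ byte of ones or zeroes; the four vmvn below invert bit positions
+       @ 0, 1, 5 and 6, i.e. the set bits of 0x63 (cf. .L63 below), which
+       @ folds the S-box affine constant into the bit-sliced round key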
+       vld1.8  {@XMM[15]}, [$inp]!             @ load next round key
+       vmvn    @XMM[0], @XMM[0]                @ "pnot"
+       vmvn    @XMM[1], @XMM[1]
+       vmvn    @XMM[5], @XMM[5]
+       vmvn    @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+       vrev32.8        @XMM[15], @XMM[15]
+#endif
+       subs    $rounds,$rounds,#1
+       vstmia  $out!,{@XMM[0]-@XMM[7]}         @ write bit-sliced round key
+       bne     .Lkey_loop
+
+       vmov.i8 @XMM[7],#0x63                   @ compose .L63
+       @ don't save last round key
+       bx      lr
+.size  _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) {               # following four functions are unsupported interface
+                       # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type  bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+
+       ldr     r5,[$inp,#240]                  @ pass rounds
+       mov     r4,$inp                         @ pass key
+       mov     r12,$out                        @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type  bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+.Lenc128_loop:
+       vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
+       vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
+       mov     r4,$key                         @ pass the key
+       vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
+       mov     r5,#10                          @ pass rounds
+       vld1.8  {@XMM[6]-@XMM[7]}, [$inp]!
+
+       bl      _bsaes_encrypt8
+
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       subs    $len,$len,#0x80
+       vst1.8  {@XMM[5]}, [$out]!
+       bhi     .Lenc128_loop
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type  bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+
+       ldr     r5,[$inp,#240]                  @ pass rounds
+       mov     r4,$inp                         @ pass key
+       mov     r12,$out                        @ pass key schedule
+       bl      _bsaes_key_convert
+       vldmia  $out, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  $out, {@XMM[7]}
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type  bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+.Ldec128_loop:
+       vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
+       vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
+       mov     r4,$key                         @ pass the key
+       vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
+       mov     r5,#10                          @ pass rounds
+       vld1.8  {@XMM[6]-@XMM[7]}, [$inp]!
+
+       bl      _bsaes_decrypt8
+
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       subs    $len,$len,#0x80
+       vst1.8  {@XMM[5]}, [$out]!
+       bhi     .Ldec128_loop
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global        bsaes_cbc_encrypt
+.type  bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef        __KERNEL__
+       cmp     $len, #128
+#ifndef        __thumb__
+       blo     AES_cbc_encrypt
+#else
+       bhs     1f
+       b       AES_cbc_encrypt
+1:
+#endif
+#endif
+
+       @ it is up to the caller to make sure we are called with enc == 0
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     $ivp, [ip]                      @ IV is 1st arg on the stack
+       mov     $len, $len, lsr#4               @ len in 16 byte blocks
+       sub     sp, #0x10                       @ scratch space to carry over the IV
+       mov     $fp, sp                         @ save sp
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       add     r12, #`128-32`                  @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12                         @ sp is $keysched
+       bl      _bsaes_key_convert
+       vldmia  $keysched, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  $keysched, {@XMM[7]}
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, $key, #248
+       vldmia  r4, {@XMM[6]}
+       vstmia  r12, {@XMM[15]}                 @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+       vld1.8  {@XMM[15]}, [$ivp]              @ load IV
+       b       .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+       subs    $len, $len, #0x8
+       bmi     .Lcbc_dec_loop_finish
+
+       vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
+       vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, $keysched                   @ pass the key
+#else
+       add     r4, $key, #248
+#endif
+       vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
+       mov     r5, $rounds
+       vld1.8  {@XMM[6]-@XMM[7]}, [$inp]
+       sub     $inp, $inp, #0x60
+       vstmia  $fp, {@XMM[15]}                 @ put aside IV
+
+       bl      _bsaes_decrypt8
+
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[12]-@XMM[13]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vld1.8  {@XMM[14]-@XMM[15]}, [$inp]!
+       veor    @XMM[7], @XMM[7], @XMM[12]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor    @XMM[3], @XMM[3], @XMM[13]
+       vst1.8  {@XMM[6]}, [$out]!
+       veor    @XMM[5], @XMM[5], @XMM[14]
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       vst1.8  {@XMM[5]}, [$out]!
+
+       b       .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+       adds    $len, $len, #8
+       beq     .Lcbc_dec_done
+
+       vld1.8  {@XMM[0]}, [$inp]!              @ load input
+       cmp     $len, #2
+       blo     .Lcbc_dec_one
+       vld1.8  {@XMM[1]}, [$inp]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, $keysched                   @ pass the key
+#else
+       add     r4, $key, #248
+#endif
+       mov     r5, $rounds
+       vstmia  $fp, {@XMM[15]}                 @ put aside IV
+       beq     .Lcbc_dec_two
+       vld1.8  {@XMM[2]}, [$inp]!
+       cmp     $len, #4
+       blo     .Lcbc_dec_three
+       vld1.8  {@XMM[3]}, [$inp]!
+       beq     .Lcbc_dec_four
+       vld1.8  {@XMM[4]}, [$inp]!
+       cmp     $len, #6
+       blo     .Lcbc_dec_five
+       vld1.8  {@XMM[5]}, [$inp]!
+       beq     .Lcbc_dec_six
+       vld1.8  {@XMM[6]}, [$inp]!
+       sub     $inp, $inp, #0x70
+
+       bl      _bsaes_decrypt8
+
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[12]-@XMM[13]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[7], @XMM[7], @XMM[12]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor    @XMM[3], @XMM[3], @XMM[13]
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+       sub     $inp, $inp, #0x60
+       bl      _bsaes_decrypt8
+       vldmia  $fp,{@XMM[14]}                  @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[12]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[7], @XMM[7], @XMM[12]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+       sub     $inp, $inp, #0x50
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+       sub     $inp, $inp, #0x40
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+       sub     $inp, $inp, #0x30
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+       sub     $inp, $inp, #0x20
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]}, [$inp]!              @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[15]}, [$inp]!             @ reload input
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+       sub     $inp, $inp, #0x10
+       mov     $rounds, $out                   @ save original out pointer
+       mov     $out, $fp                       @ use the iv scratch space as out buffer
+       mov     r2, $key
+       vmov    @XMM[4],@XMM[15]                @ just in case ensure that IV
+       vmov    @XMM[5],@XMM[0]                 @ and input are preserved
+       bl      AES_decrypt
+       vld1.8  {@XMM[0]}, [$fp,:64]            @ load result
+       veor    @XMM[0], @XMM[0], @XMM[4]       @ ^= IV
+       vmov    @XMM[15], @XMM[5]               @ @XMM[5] holds input
+       vst1.8  {@XMM[0]}, [$rounds]            @ write output
+
+.Lcbc_dec_done:
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+.Lcbc_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          $keysched!, {q0-q1}
+       cmp             $keysched, $fp
+       bne             .Lcbc_dec_bzero
+#endif
+
+       mov     sp, $fp
+       add     sp, #0x10                       @ add sp,$fp,#0x10 is no good for thumb
+       vst1.8  {@XMM[15]}, [$ivp]              @ return IV
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}
+.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6";      # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern        AES_encrypt
+.global        bsaes_ctr32_encrypt_blocks
+.type  bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+       cmp     $len, #8                        @ use plain AES for
+       blo     .Lctr_enc_short                 @ small sizes
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     $ctr, [ip]                      @ ctr is 1st arg on the stack
+       sub     sp, sp, #0x10                   @ scratch space to carry over the ctr
+       mov     $fp, sp                         @ save sp
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       add     r12, #`128-32`                  @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12                         @ sp is $keysched
+       bl      _bsaes_key_convert
+       veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+
+       vld1.8  {@XMM[0]}, [$ctr]               @ load counter
+       add     $ctr, $const, #.LREVM0SR-.LM0   @ borrow $ctr
+       vldmia  $keysched, {@XMM[4]}            @ load round0 key
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+
+.align 2
+0:     add     r12, $key, #248
+       vld1.8  {@XMM[0]}, [$ctr]               @ load counter
+       adrl    $ctr, .LREVM0SR                 @ borrow $ctr
+       vldmia  r12, {@XMM[4]}                  @ load round0 key
+       sub     sp, #0x10                       @ place for adjusted round0 key
+#endif
+
+       vmov.i32        @XMM[8],#1              @ compose 1<<96
+       veor            @XMM[9],@XMM[9],@XMM[9]
+       vrev32.8        @XMM[0],@XMM[0]
+       vext.8          @XMM[8],@XMM[9],@XMM[8],#4
+       vrev32.8        @XMM[4],@XMM[4]
+       vadd.u32        @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+       vstmia  $keysched, {@XMM[4]}            @ save adjusted round0 key
+       b       .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+       vadd.u32        @XMM[10], @XMM[8], @XMM[9]      @ compose 3<<96
+       vadd.u32        @XMM[1], @XMM[0], @XMM[8]       @ +1
+       vadd.u32        @XMM[2], @XMM[0], @XMM[9]       @ +2
+       vadd.u32        @XMM[3], @XMM[0], @XMM[10]      @ +3
+       vadd.u32        @XMM[4], @XMM[1], @XMM[10]
+       vadd.u32        @XMM[5], @XMM[2], @XMM[10]
+       vadd.u32        @XMM[6], @XMM[3], @XMM[10]
+       vadd.u32        @XMM[7], @XMM[4], @XMM[10]
+       vadd.u32        @XMM[10], @XMM[5], @XMM[10]     @ next counter
+
+       @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+       @ to flip byte order in 32-bit counter
+
+       vldmia          $keysched, {@XMM[9]}            @ load round0 key
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, $keysched, #0x10            @ pass next round key
+#else
+       add             r4, $key, #`248+16`
+#endif
+       vldmia          $ctr, {@XMM[8]}                 @ .LREVM0SR
+       mov             r5, $rounds                     @ pass rounds
+       vstmia          $fp, {@XMM[10]}                 @ save next counter
+       sub             $const, $ctr, #.LREVM0SR-.LSR   @ pass constants
+
+       bl              _bsaes_encrypt8_alt
+
+       subs            $len, $len, #8
+       blo             .Lctr_enc_loop_done
+
+       vld1.8          {@XMM[8]-@XMM[9]}, [$inp]!      @ load input
+       vld1.8          {@XMM[10]-@XMM[11]}, [$inp]!
+       veor            @XMM[0], @XMM[8]
+       veor            @XMM[1], @XMM[9]
+       vld1.8          {@XMM[12]-@XMM[13]}, [$inp]!
+       veor            @XMM[4], @XMM[10]
+       veor            @XMM[6], @XMM[11]
+       vld1.8          {@XMM[14]-@XMM[15]}, [$inp]!
+       veor            @XMM[3], @XMM[12]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor            @XMM[7], @XMM[13]
+       veor            @XMM[2], @XMM[14]
+       vst1.8          {@XMM[4]}, [$out]!
+       veor            @XMM[5], @XMM[15]
+       vst1.8          {@XMM[6]}, [$out]!
+       vmov.i32        @XMM[8], #1                     @ compose 1<<96
+       vst1.8          {@XMM[3]}, [$out]!
+       veor            @XMM[9], @XMM[9], @XMM[9]
+       vst1.8          {@XMM[7]}, [$out]!
+       vext.8          @XMM[8], @XMM[9], @XMM[8], #4
+       vst1.8          {@XMM[2]}, [$out]!
+       vadd.u32        @XMM[9],@XMM[8],@XMM[8]         @ compose 2<<96
+       vst1.8          {@XMM[5]}, [$out]!
+       vldmia          $fp, {@XMM[0]}                  @ load counter
+
+       bne             .Lctr_enc_loop
+       b               .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+       add             $len, $len, #8
+       vld1.8          {@XMM[8]}, [$inp]!      @ load input
+       veor            @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [$out]!      @ write output
+       cmp             $len, #2
+       blo             .Lctr_enc_done
+       vld1.8          {@XMM[9]}, [$inp]!
+       veor            @XMM[1], @XMM[9]
+       vst1.8          {@XMM[1]}, [$out]!
+       beq             .Lctr_enc_done
+       vld1.8          {@XMM[10]}, [$inp]!
+       veor            @XMM[4], @XMM[10]
+       vst1.8          {@XMM[4]}, [$out]!
+       cmp             $len, #4
+       blo             .Lctr_enc_done
+       vld1.8          {@XMM[11]}, [$inp]!
+       veor            @XMM[6], @XMM[11]
+       vst1.8          {@XMM[6]}, [$out]!
+       beq             .Lctr_enc_done
+       vld1.8          {@XMM[12]}, [$inp]!
+       veor            @XMM[3], @XMM[12]
+       vst1.8          {@XMM[3]}, [$out]!
+       cmp             $len, #6
+       blo             .Lctr_enc_done
+       vld1.8          {@XMM[13]}, [$inp]!
+       veor            @XMM[7], @XMM[13]
+       vst1.8          {@XMM[7]}, [$out]!
+       beq             .Lctr_enc_done
+       vld1.8          {@XMM[14]}, [$inp]
+       veor            @XMM[2], @XMM[14]
+       vst1.8          {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifndef        BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:                       @ wipe key schedule [if any]
+       vstmia          $keysched!, {q0-q1}
+       cmp             $keysched, $fp
+       bne             .Lctr_enc_bzero
+#else
+       vstmia          $keysched, {q0-q1}
+#endif
+
+       mov     sp, $fp
+       add     sp, #0x10               @ add sp,$fp,#0x10 is no good for thumb
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}       @ return
+
+.align 4
+.Lctr_enc_short:
+       ldr     ip, [sp]                @ ctr pointer is passed on stack
+       stmdb   sp!, {r4-r8, lr}
+
+       mov     r4, $inp                @ copy arguments
+       mov     r5, $out
+       mov     r6, $len
+       mov     r7, $key
+       ldr     r8, [ip, #12]           @ load counter LSW
+       vld1.8  {@XMM[1]}, [ip]         @ load whole counter value
+#ifdef __ARMEL__
+       rev     r8, r8
+#endif
+       sub     sp, sp, #0x10
+       vst1.8  {@XMM[1]}, [sp,:64]     @ copy counter value
+       sub     sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+       add     r0, sp, #0x10           @ input counter value
+       mov     r1, sp                  @ output on the stack
+       mov     r2, r7                  @ key
+
+       bl      AES_encrypt
+
+       vld1.8  {@XMM[0]}, [r4]!        @ load input
+       vld1.8  {@XMM[1]}, [sp,:64]     @ load encrypted counter
+       add     r8, r8, #1
+#ifdef __ARMEL__
+       rev     r0, r8
+       str     r0, [sp, #0x1c]         @ next counter value
+#else
+       str     r8, [sp, #0x1c]         @ next counter value
+#endif
+       veor    @XMM[0],@XMM[0],@XMM[1]
+       vst1.8  {@XMM[0]}, [r5]!        @ store output
+       subs    r6, r6, #1
+       bne     .Lctr_enc_short_loop
+
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+       vstmia          sp!, {q0-q1}
+
+       ldmia   sp!, {r4-r8, pc}
+.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#      const AES_KEY *key1, const AES_KEY *key2,
+#      const unsigned char iv[16]);
+#
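+# key1 is the data key and is converted to the bit-sliced schedule below;
+# without XTS_CHAIN_TWEAK the initial tweak is generated here by encrypting
+# iv[] with key2 via AES_encrypt, otherwise the caller passes the tweak in
+# directly.  len is a byte count, and a trailing partial block is handled
+# (in the non-chaining case) by the ciphertext stealing code at
+# .Lxts_enc_steal.
+#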
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6";                # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type  bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future $fp
+
+       mov     $inp, r0
+       mov     $out, r1
+       mov     $len, r2
+       mov     $key, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0,sp                           @ pointer to initial tweak
+#endif
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+       mov     $fp, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       @ add   r12, #`128-32`                  @ size of bit-sliced key schedule
+       sub     r12, #`32+16`                   @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7], @XMM[7], @XMM[15]      @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7], @XMM[7], @XMM[15]      @ fix up last round key
+       vstmia  r12, {@XMM[7]}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+
+       vld1.8  {@XMM[8]}, [r0]                 @ initial tweak
+       adr     $magic, .Lxts_magic
+
+       subs    $len, #0x80
+       blo     .Lxts_enc_short
+       b       .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       vadd.u64        @XMM[8], @XMM[15], @XMM[15]
+       vst1.64         {@XMM[15]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       veor            @XMM[8], @XMM[8], @T[0]
+       vst1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       vld1.8          {@XMM[6]-@XMM[7]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       veor            @XMM[7], @XMM[7], @XMM[15]
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       vld1.64         {@XMM[14]-@XMM[15]}, [r0,:128]!
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[2], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       veor            @XMM[13], @XMM[5], @XMM[15]
+       vst1.8          {@XMM[12]-@XMM[13]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       subs            $len, #0x80
+       bpl             .Lxts_enc_loop
+
+.Lxts_enc_short:
+       adds            $len, #0x70
+       bmi             .Lxts_enc_done
+
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+       subs            $len, #0x10
+       bmi             .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       sub             $len, #0x10
+       vst1.64         {@XMM[15]}, [r0,:128]           @ next round tweak
+
+       vld1.8          {@XMM[6]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       vld1.64         {@XMM[14]}, [r0,:128]!
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[2], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       vst1.8          {@XMM[12]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+       vst1.64         {@XMM[14]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[4], @XMM[4], @XMM[12]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[5], @XMM[5], @XMM[13]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+       .quad   1, 0x87
+
+.align 5
+.Lxts_enc_5:
+       vst1.64         {@XMM[13]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[3], @XMM[3], @XMM[11]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[4], @XMM[4], @XMM[12]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       vst1.8          {@XMM[10]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+       vst1.64         {@XMM[12]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[2], @XMM[2], @XMM[10]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[3], @XMM[3], @XMM[11]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+       vst1.64         {@XMM[11]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[1], @XMM[1], @XMM[9]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[2], @XMM[2], @XMM[10]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       vld1.64         {@XMM[10]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       vst1.8          {@XMM[8]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+       vst1.64         {@XMM[10]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[0], @XMM[0], @XMM[8]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[1], @XMM[1], @XMM[9]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+       mov             r0, sp
+       veor            @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                         @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [$out]!
+       mov             $fp, r4
+
+       vmov            @XMM[8], @XMM[9]                @ next round tweak
+
+.Lxts_enc_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            $len, #0x10
+       beq             .Lxts_enc_ret
+       sub             r6, $out, #0x10
+
+.Lxts_enc_steal:
+       ldrb            r0, [$inp], #1
+       ldrb            r1, [$out, #-0x10]
+       strb            r0, [$out, #-0x10]
+       strb            r1, [$out], #1
+
+       subs            $len, #1
+       bhi             .Lxts_enc_steal
+
+       vld1.8          {@XMM[0]}, [r6]
+       mov             r0, sp
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                 @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [r6]
+       mov             $fp, r4
+#endif
+
+.Lxts_enc_ret:
+       bic             r0, $fp, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [$fp, #0x20+VFP_ABI_FRAME]  @ chain tweak
+#endif
+.Lxts_enc_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_enc_bzero
+
+       mov             sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {@XMM[8]}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type  bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future $fp
+
+       mov     $inp, r0
+       mov     $out, r1
+       mov     $len, r2
+       mov     $key, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0, sp                          @ pointer to initial tweak
+#endif
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+       mov     $fp, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       @ add   r12, #`128-32`                  @ size of bit-sliced key schedule
+       sub     r12, #`32+16`                   @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, sp, #0x90
+       vldmia  r4, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  r4, {@XMM[7]}
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, $key, #248
+       vldmia  r4, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  r4, {@XMM[7]}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+       vld1.8  {@XMM[8]}, [r0]                 @ initial tweak
+       adr     $magic, .Lxts_magic
+
+       tst     $len, #0xf                      @ if not multiple of 16
+       it      ne                              @ Thumb2 thing, sanity check in ARM
+       subne   $len, #0x10                     @ subtract another 16 bytes
+       subs    $len, #0x80
+
+       blo     .Lxts_dec_short
+       b       .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       vadd.u64        @XMM[8], @XMM[15], @XMM[15]
+       vst1.64         {@XMM[15]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       veor            @XMM[8], @XMM[8], @T[0]
+       vst1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       vld1.8          {@XMM[6]-@XMM[7]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       veor            @XMM[7], @XMM[7], @XMM[15]
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       vld1.64         {@XMM[14]-@XMM[15]}, [r0,:128]!
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[3], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       veor            @XMM[13], @XMM[5], @XMM[15]
+       vst1.8          {@XMM[12]-@XMM[13]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       subs            $len, #0x80
+       bpl             .Lxts_dec_loop
+
+.Lxts_dec_short:
+       adds            $len, #0x70
+       bmi             .Lxts_dec_done
+
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+       subs            $len, #0x10
+       bmi             .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       sub             $len, #0x10
+       vst1.64         {@XMM[15]}, [r0,:128]           @ next round tweak
+
+       vld1.8          {@XMM[6]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       vld1.64         {@XMM[14]}, [r0,:128]!
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[3], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       vst1.8          {@XMM[12]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+       vst1.64         {@XMM[14]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[4], @XMM[4], @XMM[12]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[5], @XMM[5], @XMM[13]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+       vst1.64         {@XMM[13]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[3], @XMM[3], @XMM[11]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[4], @XMM[4], @XMM[12]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       vst1.8          {@XMM[10]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+       vst1.64         {@XMM[12]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[2], @XMM[2], @XMM[10]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[3], @XMM[3], @XMM[11]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+       vst1.64         {@XMM[11]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[1], @XMM[1], @XMM[9]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[2], @XMM[2], @XMM[10]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       vld1.64         {@XMM[10]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       vst1.8          {@XMM[8]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+       vst1.64         {@XMM[10]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[0], @XMM[0], @XMM[8]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[1], @XMM[1], @XMM[9]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+       mov             r0, sp
+       veor            @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                         @ preserve fp
+       mov             r5, $magic                      @ preserve magic
+
+       bl              AES_decrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [$out]!
+       mov             $fp, r4
+       mov             $magic, r5
+
+       vmov            @XMM[8], @XMM[9]                @ next round tweak
+
+.Lxts_dec_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            $len, #0x10
+       beq             .Lxts_dec_ret
+
+       @ calculate one round of extra tweak for the stolen ciphertext
+       vldmia          $magic, {$twmask}
+       vshr.s64        @XMM[6], @XMM[8], #63
+       vand            @XMM[6], @XMM[6], $twmask
+       vadd.u64        @XMM[9], @XMM[8], @XMM[8]
+       vswp            `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+       veor            @XMM[9], @XMM[9], @XMM[6]
+
+       @ perform the final decryption with the last tweak value
+       vld1.8          {@XMM[0]}, [$inp]!
+       mov             r0, sp
+       veor            @XMM[0], @XMM[0], @XMM[9]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                 @ preserve fp
+
+       bl              AES_decrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[9]
+       vst1.8          {@XMM[0]}, [$out]
+
+       mov             r6, $out
+.Lxts_dec_steal:
+       ldrb            r1, [$out]
+       ldrb            r0, [$inp], #1
+       strb            r1, [$out, #0x10]
+       strb            r0, [$out], #1
+
+       subs            $len, #1
+       bhi             .Lxts_dec_steal
+
+       vld1.8          {@XMM[0]}, [r6]
+       mov             r0, sp
+       veor            @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+
+       bl              AES_decrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [r6]
+       mov             $fp, r4
+#endif
+
+.Lxts_dec_ret:
+       bic             r0, $fp, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [$fp, #0x20+VFP_ABI_FRAME]  @ chain tweak
+#endif
+.Lxts_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_dec_bzero
+
+       mov             sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {@XMM[8]}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+        last if (!s/^#/@/ and !/^$/);
+        print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
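
The vadd.u64/vshr.s64/vswp/vand sequences in the tweak loops above implement the XTS tweak update, i.e. multiplication of the 128-bit tweak by x in GF(2^128) using the 0x87 reduction constant loaded from .Lxts_magic. A minimal C sketch of the same update, assuming the tweak is held as two little-endian 64-bit halves (the helper name is made up):

#include <stdint.h>

/*
 * Hypothetical helper, for illustration only: advance an XTS tweak, i.e.
 * multiply it by x in GF(2^128).  t[0] holds the low 64 bits and t[1] the
 * high 64 bits of the little-endian tweak; 0x87 is the reduction constant
 * kept at .Lxts_magic above.
 */
static void xts_next_tweak(uint64_t t[2])
{
        uint64_t carry = t[1] >> 63;            /* bit shifted out of the top */

        t[1] = (t[1] << 1) | (t[0] >> 63);      /* 128-bit left shift by one  */
        t[0] = (t[0] << 1) ^ (carry * 0x87);    /* fold the carry back in     */
}

The loops above perform this update once per block, queuing each previous tweak on the stack so it can be XORed back into the _bsaes_encrypt8/_bsaes_decrypt8 output.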
index 1a7024b413511d6742f6fb928407603c648c77e3..c38b58c8020215f54af8140e97de8467ad3c3c16 100644 (file)
@@ -24,6 +24,7 @@ generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
 generic-y += siginfo.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += socket.h
 generic-y += sockios.h
index fcc1b5bf6979cafd4850ff88b3cf8d21bb6791a5..5c228516057552b6eca4f6f5a54f2a3f9733fd09 100644 (file)
 #define put_byte_3      lsl #0
 #endif
 
+/* Select code for any configuration running in BE8 mode */
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define ARM_BE8(code...) code
+#else
+#define ARM_BE8(code...)
+#endif
+
 /*
  * Data preload for architectures that support it
  */
index da1c77d39327963ab10e633aeb8809aac7da2dec..62d2cb53b06935aed4430bf33c6a127239801ff1 100644 (file)
@@ -12,6 +12,7 @@
 #define __ASM_ARM_ATOMIC_H
 
 #include <linux/compiler.h>
+#include <linux/prefetch.h>
 #include <linux/types.h>
 #include <linux/irqflags.h>
 #include <asm/barrier.h>
@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v)
        unsigned long tmp;
        int result;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic_add\n"
 "1:    ldrex   %0, [%3]\n"
 "      add     %0, %0, %4\n"
@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v)
        unsigned long tmp;
        int result;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic_sub\n"
 "1:    ldrex   %0, [%3]\n"
 "      sub     %0, %0, %4\n"
@@ -114,7 +117,8 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 
 static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 {
-       unsigned long oldval, res;
+       int oldval;
+       unsigned long res;
 
        smp_mb();
 
@@ -134,21 +138,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
        return oldval;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-       unsigned long tmp, tmp2;
-
-       __asm__ __volatile__("@ atomic_clear_mask\n"
-"1:    ldrex   %0, [%3]\n"
-"      bic     %0, %0, %4\n"
-"      strex   %1, %0, [%3]\n"
-"      teq     %1, #0\n"
-"      bne     1b"
-       : "=&r" (tmp), "=&r" (tmp2), "+Qo" (*addr)
-       : "r" (addr), "Ir" (mask)
-       : "cc");
-}
-
 #else /* ARM_ARCH_6 */
 
 #ifdef CONFIG_SMP
@@ -197,15 +186,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
        return ret;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-       unsigned long flags;
-
-       raw_local_irq_save(flags);
-       *addr &= ~mask;
-       raw_local_irq_restore(flags);
-}
-
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -238,15 +218,15 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 
 #ifndef CONFIG_GENERIC_ATOMIC64
 typedef struct {
-       u64 __aligned(8) counter;
+       long long counter;
 } atomic64_t;
 
 #define ATOMIC64_INIT(i) { (i) }
 
 #ifdef CONFIG_ARM_LPAE
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
-       u64 result;
+       long long result;
 
        __asm__ __volatile__("@ atomic64_read\n"
 "      ldrd    %0, %H0, [%1]"
@@ -257,7 +237,7 @@ static inline u64 atomic64_read(const atomic64_t *v)
        return result;
 }
 
-static inline void atomic64_set(atomic64_t *v, u64 i)
+static inline void atomic64_set(atomic64_t *v, long long i)
 {
        __asm__ __volatile__("@ atomic64_set\n"
 "      strd    %2, %H2, [%1]"
@@ -266,9 +246,9 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
        );
 }
 #else
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
-       u64 result;
+       long long result;
 
        __asm__ __volatile__("@ atomic64_read\n"
 "      ldrexd  %0, %H0, [%1]"
@@ -279,10 +259,11 @@ static inline u64 atomic64_read(const atomic64_t *v)
        return result;
 }
 
-static inline void atomic64_set(atomic64_t *v, u64 i)
+static inline void atomic64_set(atomic64_t *v, long long i)
 {
-       u64 tmp;
+       long long tmp;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic64_set\n"
 "1:    ldrexd  %0, %H0, [%2]\n"
 "      strexd  %0, %3, %H3, [%2]\n"
@@ -294,15 +275,16 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
 }
 #endif
 
-static inline void atomic64_add(u64 i, atomic64_t *v)
+static inline void atomic64_add(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic64_add\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      adds    %0, %0, %4\n"
-"      adc     %H0, %H0, %H4\n"
+"      adds    %Q0, %Q0, %Q4\n"
+"      adc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -311,17 +293,17 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
        : "cc");
 }
 
-static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
 
        __asm__ __volatile__("@ atomic64_add_return\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      adds    %0, %0, %4\n"
-"      adc     %H0, %H0, %H4\n"
+"      adds    %Q0, %Q0, %Q4\n"
+"      adc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -334,15 +316,16 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
        return result;
 }
 
-static inline void atomic64_sub(u64 i, atomic64_t *v)
+static inline void atomic64_sub(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic64_sub\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      subs    %0, %0, %4\n"
-"      sbc     %H0, %H0, %H4\n"
+"      subs    %Q0, %Q0, %Q4\n"
+"      sbc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -351,17 +334,17 @@ static inline void atomic64_sub(u64 i, atomic64_t *v)
        : "cc");
 }
 
-static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
+static inline long long atomic64_sub_return(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
 
        __asm__ __volatile__("@ atomic64_sub_return\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      subs    %0, %0, %4\n"
-"      sbc     %H0, %H0, %H4\n"
+"      subs    %Q0, %Q0, %Q4\n"
+"      sbc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -374,9 +357,10 @@ static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
        return result;
 }
 
-static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
+static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
+                                       long long new)
 {
-       u64 oldval;
+       long long oldval;
        unsigned long res;
 
        smp_mb();
@@ -398,9 +382,9 @@ static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
        return oldval;
 }
 
-static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
+static inline long long atomic64_xchg(atomic64_t *ptr, long long new)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
@@ -419,18 +403,18 @@ static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
        return result;
 }
 
-static inline u64 atomic64_dec_if_positive(atomic64_t *v)
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
 
        __asm__ __volatile__("@ atomic64_dec_if_positive\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      subs    %0, %0, #1\n"
-"      sbc     %H0, %H0, #0\n"
-"      teq     %H0, #0\n"
+"      subs    %Q0, %Q0, #1\n"
+"      sbc     %R0, %R0, #0\n"
+"      teq     %R0, #0\n"
 "      bmi     2f\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
@@ -445,9 +429,9 @@ static inline u64 atomic64_dec_if_positive(atomic64_t *v)
        return result;
 }
 
-static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
-       u64 val;
+       long long val;
        unsigned long tmp;
        int ret = 1;
 
@@ -459,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
 "      teqeq   %H0, %H5\n"
 "      moveq   %1, #0\n"
 "      beq     2f\n"
-"      adds    %0, %0, %6\n"
-"      adc     %H0, %H0, %H6\n"
+"      adds    %Q0, %Q0, %Q6\n"
+"      adc     %R0, %R0, %R6\n"
 "      strexd  %2, %0, %H0, [%4]\n"
 "      teq     %2, #0\n"
 "      bne     1b\n"
diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h
new file mode 100644 (file)
index 0000000..1714800
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * arch/arm/include/asm/bL_switcher.h
+ *
+ * Created by:  Nicolas Pitre, April 2012
+ * Copyright:   (C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_BL_SWITCHER_H
+#define ASM_BL_SWITCHER_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+typedef void (*bL_switch_completion_handler)(void *cookie);
+
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+                        bL_switch_completion_handler completer,
+                        void *completer_cookie);
+static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
+{
+       return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
+}
+
+/*
+ * Register here to be notified about runtime enabling/disabling of
+ * the switcher.
+ *
+ * The notifier chain is called with the switcher activation lock held:
+ * the switcher will not be enabled or disabled during callbacks.
+ * Callbacks must not call bL_switcher_{get,put}_enabled().
+ */
+#define BL_NOTIFY_PRE_ENABLE   0
+#define BL_NOTIFY_POST_ENABLE  1
+#define BL_NOTIFY_PRE_DISABLE  2
+#define BL_NOTIFY_POST_DISABLE 3
+
+#ifdef CONFIG_BL_SWITCHER
+
+int bL_switcher_register_notifier(struct notifier_block *nb);
+int bL_switcher_unregister_notifier(struct notifier_block *nb);
+
+/*
+ * Use these functions to temporarily prevent enabling/disabling of
+ * the switcher.
+ * bL_switcher_get_enabled() returns true if the switcher is currently
+ * enabled.  Each call to bL_switcher_get_enabled() must be followed
+ * by a call to bL_switcher_put_enabled().  These functions are not
+ * recursive.
+ */
+bool bL_switcher_get_enabled(void);
+void bL_switcher_put_enabled(void);
+
+int bL_switcher_trace_trigger(void);
+int bL_switcher_get_logical_index(u32 mpidr);
+
+#else
+static inline int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline bool bL_switcher_get_enabled(void) { return false; }
+static inline void bL_switcher_put_enabled(void) { }
+static inline int bL_switcher_trace_trigger(void) { return 0; }
+static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
+#endif /* CONFIG_BL_SWITCHER */
+
+#endif
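
The header above only declares the switcher interface. As a hedged sketch of how a caller might block until a requested switch has actually happened, pairing bL_switch_request_cb() with a completion (the my_* names are made up, and a zero return is assumed to mean the request was accepted):

#include <linux/completion.h>
#include <asm/bL_switcher.h>

/* Hypothetical completer: runs once the switch for this CPU has taken place. */
static void my_switch_done(void *cookie)
{
        complete(cookie);
}

/* Hypothetical helper: request a switch and wait for it to complete. */
static int my_switch_and_wait(unsigned int cpu, unsigned int new_cluster)
{
        DECLARE_COMPLETION_ONSTACK(done);
        int ret;

        ret = bL_switch_request_cb(cpu, new_cluster, my_switch_done, &done);
        if (!ret)
                wait_for_completion(&done);
        return ret;
}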
index 7af5c6c3653a8061bad62211bbd3073af82ab3f0..b274bde24905a7503f60d38672346636d093854b 100644 (file)
@@ -2,6 +2,8 @@
 #define _ASMARM_BUG_H
 
 #include <linux/linkage.h>
+#include <linux/types.h>
+#include <asm/opcodes.h>
 
 #ifdef CONFIG_BUG
 
  */
 #ifdef CONFIG_THUMB2_KERNEL
 #define BUG_INSTR_VALUE 0xde02
-#define BUG_INSTR_TYPE ".hword "
+#define BUG_INSTR(__value) __inst_thumb16(__value)
 #else
 #define BUG_INSTR_VALUE 0xe7f001f2
-#define BUG_INSTR_TYPE ".word "
+#define BUG_INSTR(__value) __inst_arm(__value)
 #endif
 
 
@@ -33,7 +35,7 @@
 
 #define __BUG(__file, __line, __value)                         \
 do {                                                           \
-       asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n"        \
+       asm volatile("1:\t" BUG_INSTR(__value) "\n"  \
                ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
                "2:\t.asciz " #__file "\n"                      \
                ".popsection\n"                                 \
@@ -48,7 +50,7 @@ do {                                                          \
 
 #define __BUG(__file, __line, __value)                         \
 do {                                                           \
-       asm volatile(BUG_INSTR_TYPE #__value);                  \
+       asm volatile(BUG_INSTR(__value) "\n");                  \
        unreachable();                                          \
 } while (0)
 #endif  /* CONFIG_DEBUG_BUGVERBOSE */
index 15f2d5bf8875636e1514d377fc61b426534b3f74..ee753f1749cd795b03557273afdb3d0832d6fc8e 100644 (file)
@@ -435,4 +435,50 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
 #define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr))
 #define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr))
 
+/*
+ * Disabling cache access for one CPU in an ARMv7 SMP system is tricky.
+ * To do so we must:
+ *
+ * - Clear the SCTLR.C bit to prevent further cache allocations
+ * - Flush the desired level of cache
+ * - Clear the ACTLR "SMP" bit to disable local coherency
+ *
+ * ... and to do so without any memory access in between those steps,
+ * not even to the stack.
+ *
+ * WARNING -- After this has been called:
+ *
+ * - No ldrex/strex (and similar) instructions must be used.
+ * - The CPU is obviously no longer coherent with the other CPUs.
+ * - This is unlikely to work as expected if Linux is running non-secure.
+ *
+ * Note:
+ *
+ * - This is known to apply to several ARMv7 processor implementations;
+ *   however, some exceptions may exist.  Caveat emptor.
+ *
+ * - The clobber list is dictated by the call to v7_flush_dcache_*.
+ *   fp is preserved to the stack explicitly prior to disabling the cache,
+ *   since adding it to the clobber list is incompatible with having
+ *   CONFIG_FRAME_POINTER=y.  ip is saved as well in case the linker ever
+ *   inserts r12-clobbering trampolines, and to keep sp 64-bit aligned.
+ */
+#define v7_exit_coherency_flush(level) \
+       asm volatile( \
+       "stmfd  sp!, {fp, ip} \n\t" \
+       "mrc    p15, 0, r0, c1, c0, 0   @ get SCTLR \n\t" \
+       "bic    r0, r0, #"__stringify(CR_C)" \n\t" \
+       "mcr    p15, 0, r0, c1, c0, 0   @ set SCTLR \n\t" \
+       "isb    \n\t" \
+       "bl     v7_flush_dcache_"__stringify(level)" \n\t" \
+       "clrex  \n\t" \
+       "mrc    p15, 0, r0, c1, c0, 1   @ get ACTLR \n\t" \
+       "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t" \
+       "mcr    p15, 0, r0, c1, c0, 1   @ set ACTLR \n\t" \
+       "isb    \n\t" \
+       "dsb    \n\t" \
+       "ldmfd  sp!, {fp, ip}" \
+       : : : "r0","r1","r2","r3","r4","r5","r6","r7", \
+             "r9","r10","lr","memory" )
+
 #endif
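
A hedged usage sketch for the macro above: everything around the v7_exit_coherency_flush() call is a placeholder, and the louis argument assumes the usual v7_flush_dcache_louis/v7_flush_dcache_all naming of the flush routines the macro branches to.

#include <asm/cacheflush.h>

/* Hypothetical MCPM-style power_down path; only the flush call is real. */
static void my_pm_power_down(void)
{
        /* ... record that this CPU is going down, drop any locks ... */

        /*
         * Flush the local cache levels, clear SCTLR.C and leave SMP
         * coherency in one sequence, with no memory accesses in between.
         */
        v7_exit_coherency_flush(louis);

        /* ... the power controller may now safely cut power to this CPU ... */
}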
index 4f009c10540dff2a2e7efd08b0671c2369547b90..df2fbba7efc80d57074a6053704a9c70119aae03 100644 (file)
@@ -223,6 +223,42 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
        return ret;
 }
 
+static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
+                                            unsigned long long old,
+                                            unsigned long long new)
+{
+       unsigned long long oldval;
+       unsigned long res;
+
+       __asm__ __volatile__(
+"1:    ldrexd          %1, %H1, [%3]\n"
+"      teq             %1, %4\n"
+"      teqeq           %H1, %H4\n"
+"      bne             2f\n"
+"      strexd          %0, %5, %H5, [%3]\n"
+"      teq             %0, #0\n"
+"      bne             1b\n"
+"2:"
+       : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr)
+       : "r" (ptr), "r" (old), "r" (new)
+       : "cc");
+
+       return oldval;
+}
+
+static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
+                                               unsigned long long old,
+                                               unsigned long long new)
+{
+       unsigned long long ret;
+
+       smp_mb();
+       ret = __cmpxchg64(ptr, old, new);
+       smp_mb();
+
+       return ret;
+}
+
 #define cmpxchg_local(ptr,o,n)                                         \
        ((__typeof__(*(ptr)))__cmpxchg_local((ptr),                     \
                                       (unsigned long)(o),              \
@@ -230,18 +266,16 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
                                       sizeof(*(ptr))))
 
 #define cmpxchg64(ptr, o, n)                                           \
-       ((__typeof__(*(ptr)))atomic64_cmpxchg(container_of((ptr),       \
-                                               atomic64_t,             \
-                                               counter),               \
-                                             (unsigned long long)(o),  \
-                                             (unsigned long long)(n)))
-
-#define cmpxchg64_local(ptr, o, n)                                     \
-       ((__typeof__(*(ptr)))local64_cmpxchg(container_of((ptr),        \
-                                               local64_t,              \
-                                               a),                     \
-                                            (unsigned long long)(o),   \
-                                            (unsigned long long)(n)))
+       ((__typeof__(*(ptr)))__cmpxchg64_mb((ptr),                      \
+                                       (unsigned long long)(o),        \
+                                       (unsigned long long)(n)))
+
+#define cmpxchg64_relaxed(ptr, o, n)                                   \
+       ((__typeof__(*(ptr)))__cmpxchg64((ptr),                         \
+                                       (unsigned long long)(o),        \
+                                       (unsigned long long)(n)))
+
+#define cmpxchg64_local(ptr, o, n)     cmpxchg64_relaxed((ptr), (o), (n))
 
 #endif /* __LINUX_ARM_ARCH__ >= 6 */
 
index 9672e978d50df67d94c3dd86d23f3bcdd187c54d..acdde76b39bbae3064034fff78b9dd2b95bbd39c 100644 (file)
@@ -10,6 +10,7 @@
 #define CPUID_TLBTYPE  3
 #define CPUID_MPUIR    4
 #define CPUID_MPIDR    5
+#define CPUID_REVIDR   6
 
 #ifdef CONFIG_CPU_V7M
 #define CPUID_EXT_PFR0 0x40
index 2740c2a2df639361617f6fe484ead14f8625eaf2..fe3ea776dc34267724f377465134e52b39434fed 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/threads.h>
 #include <asm/irq.h>
 
-#define NR_IPI 6
+#define NR_IPI 8
 
 typedef struct {
        unsigned int __softirq_pending;
index 0cf7a6b842ff4ad40847394ea5ff8fd77bce29e7..ad774f37c47cda0f6201d7a7aad7d967cec4786b 100644 (file)
@@ -24,8 +24,8 @@
 #define TRACER_TIMEOUT 10000
 
 #define etm_writel(t, v, x) \
-       (__raw_writel((v), (t)->etm_regs + (x)))
-#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x)))
+       (writel_relaxed((v), (t)->etm_regs + (x)))
+#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
 
 /* CoreSight Management Registers */
 #define CSMR_LOCKACCESS 0xfb0
 #define ETBFF_TRIGFL           BIT(10)
 
 #define etb_writel(t, v, x) \
-       (__raw_writel((v), (t)->etb_regs + (x)))
-#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x)))
+       (writel_relaxed((v), (t)->etb_regs + (x)))
+#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
 
 #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
 #define etm_unlock(t) \
index 48066ce9ea34f64961111c487fbc4f26e037b842..0a9d5dd932941a1f0635574904ecd47259b87295 100644 (file)
@@ -11,6 +11,7 @@
 #define __ARM_KGDB_H__
 
 #include <linux/ptrace.h>
+#include <asm/opcodes.h>
 
 /*
  * GDB assumes that we're a user process being debugged, so
@@ -41,7 +42,7 @@
 
 static inline void arch_kgdb_breakpoint(void)
 {
-       asm(".word 0xe7ffdeff");
+       asm(__inst_arm(0xe7ffdeff));
 }
 
 extern void kgdb_handle_bus_error(void);
index 402a2bc6aa687b94b09af6efa039c58fb80436d2..17a3fa2979e8ae5c5a56f88eda448f635c3e132c 100644 (file)
@@ -49,6 +49,7 @@ struct machine_desc {
        bool                    (*smp_init)(void);
        void                    (*fixup)(struct tag *, char **,
                                         struct meminfo *);
+       void                    (*init_meminfo)(void);
        void                    (*reserve)(void);/* reserve mem blocks  */
        void                    (*map_io)(void);/* IO mapping function  */
        void                    (*init_early)(void);
index fc82a88f5b69e556e9235583364b5149b2572b66..608516ebabfe6111a651f3a5ca6e63c607046fab 100644 (file)
@@ -41,6 +41,14 @@ extern void mcpm_entry_point(void);
  */
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
 
+/*
+ * This sets an early poke, i.e. a value to be poked into some address
+ * from very early assembly code before the CPU is ungated.  The
+ * address must be physical, and if 0 then nothing will happen.
+ */
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+                        unsigned long poke_phys_addr, unsigned long poke_val);
+
 /*
  * CPU/cluster power operations API for higher subsystems to use.
  */
@@ -81,9 +89,39 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster);
  *
  * This will return if mcpm_platform_register() has not been called
  * previously in which case the caller should take appropriate action.
+ *
+ * On success, the CPU is not guaranteed to be truly halted until
+ * mcpm_cpu_power_down_finish() subsequently returns zero for the
+ * specified cpu.  Until then, other CPUs should make sure they do not
+ * trash memory the target CPU might be executing/accessing.
  */
 void mcpm_cpu_power_down(void);
 
+/**
+ * mcpm_cpu_power_down_finish - wait for a specified CPU to halt, and
+ *     make sure it is powered off
+ *
+ * @cpu: CPU number within given cluster
+ * @cluster: cluster number for the CPU
+ *
+ * Call this function to ensure that a pending powerdown has taken
+ * effect and the CPU is safely parked before performing non-mcpm
+ * operations that may affect the CPU (such as kexec trashing the
+ * kernel text).
+ *
+ * It is *not* necessary to call this function if you only need to
+ * serialise a pending powerdown with mcpm_cpu_power_up() or a wakeup
+ * event.
+ *
+ * Do not call this function unless the specified CPU has already
+ * called mcpm_cpu_power_down() or has committed to doing so.
+ *
+ * @return:
+ *     - zero if the CPU is in a safely parked state
+ *     - nonzero otherwise (e.g., timeout)
+ */
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster);
+
 /**
  * mcpm_cpu_suspend - bring the calling CPU in a suspended state
  *
@@ -126,6 +164,7 @@ int mcpm_cpu_powered_up(void);
 struct mcpm_platform_ops {
        int (*power_up)(unsigned int cpu, unsigned int cluster);
        void (*power_down)(void);
+       int (*power_down_finish)(unsigned int cpu, unsigned int cluster);
        void (*suspend)(u64);
        void (*powered_up)(void);
 };
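
The power_down/power_down_finish pairing documented above is what a CPU-hotplug kill path would build on. A hedged sketch follows: only mcpm_cpu_power_down_finish() comes from this header, the MPIDR decoding is one common convention, and the my_cpu_kill name is made up.

#include <asm/cputype.h>
#include <asm/mcpm.h>
#include <asm/smp_plat.h>

/*
 * Hypothetical smp_operations .cpu_kill handler: report success (nonzero)
 * only once the dying CPU is known to be safely parked.
 */
static int my_cpu_kill(unsigned int cpu)
{
        unsigned int mpidr = cpu_logical_map(cpu);
        unsigned int pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
        unsigned int pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);

        return !mcpm_cpu_power_down_finish(pcpu, pcluster);
}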
index e750a938fd3ce283ccd351cb73ed9c3d8df84e75..4dd21457ef9d2be8b1c94cac7eea97c8ef8cc1f6 100644 (file)
  * so that all we need to do is modify the 8-bit constant field.
  */
 #define __PV_BITS_31_24        0x81000000
+#define __PV_BITS_7_0  0x81
+
+extern u64 __pv_phys_offset;
+extern u64 __pv_offset;
+extern void fixup_pv_table(const void *, unsigned long);
+extern const void *__pv_table_begin, *__pv_table_end;
 
-extern unsigned long __pv_phys_offset;
 #define PHYS_OFFSET __pv_phys_offset
 
 #define __pv_stub(from,to,instr,type)                  \
@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset;
        : "=r" (to)                                     \
        : "r" (from), "I" (type))
 
-static inline unsigned long __virt_to_phys(unsigned long x)
+#define __pv_stub_mov_hi(t)                            \
+       __asm__ volatile("@ __pv_stub_mov\n"            \
+       "1:     mov     %R0, %1\n"                      \
+       "       .pushsection .pv_table,\"a\"\n"         \
+       "       .long   1b\n"                           \
+       "       .popsection\n"                          \
+       : "=r" (t)                                      \
+       : "I" (__PV_BITS_7_0))
+
+#define __pv_add_carry_stub(x, y)                      \
+       __asm__ volatile("@ __pv_add_carry_stub\n"      \
+       "1:     adds    %Q0, %1, %2\n"                  \
+       "       adc     %R0, %R0, #0\n"                 \
+       "       .pushsection .pv_table,\"a\"\n"         \
+       "       .long   1b\n"                           \
+       "       .popsection\n"                          \
+       : "+r" (y)                                      \
+       : "r" (x), "I" (__PV_BITS_31_24)                \
+       : "cc")
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
 {
-       unsigned long t;
-       __pv_stub(x, t, "add", __PV_BITS_31_24);
+       phys_addr_t t;
+
+       if (sizeof(phys_addr_t) == 4) {
+               __pv_stub(x, t, "add", __PV_BITS_31_24);
+       } else {
+               __pv_stub_mov_hi(t);
+               __pv_add_carry_stub(x, t);
+       }
        return t;
 }
 
-static inline unsigned long __phys_to_virt(unsigned long x)
+static inline unsigned long __phys_to_virt(phys_addr_t x)
 {
        unsigned long t;
        __pv_stub(x, t, "sub", __PV_BITS_31_24);
        return t;
 }
+
 #else
-#define __virt_to_phys(x)      ((x) - PAGE_OFFSET + PHYS_OFFSET)
-#define __phys_to_virt(x)      ((x) - PHYS_OFFSET + PAGE_OFFSET)
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
+{
+       return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
+}
+
+static inline unsigned long __phys_to_virt(phys_addr_t x)
+{
+       return x - PHYS_OFFSET + PAGE_OFFSET;
+}
+
 #endif
 #endif
 #endif /* __ASSEMBLY__ */
@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x)
 
 static inline void *phys_to_virt(phys_addr_t x)
 {
-       return (void *)(__phys_to_virt((unsigned long)(x)));
+       return (void *)__phys_to_virt(x);
 }
 
 /*
  * Drivers should NOT use these either.
  */
 #define __pa(x)                        __virt_to_phys((unsigned long)(x))
-#define __va(x)                        ((void *)__phys_to_virt((unsigned long)(x)))
+#define __va(x)                        ((void *)__phys_to_virt((phys_addr_t)(x)))
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
 
+extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);
+
+/*
+ * These are for systems that have a hardware-interconnect-supported alias of
+ * physical memory for idmap purposes.  Most cases should leave these
+ * untouched.
+ */
+static inline phys_addr_t __virt_to_idmap(unsigned long x)
+{
+       if (arch_virt_to_idmap)
+               return arch_virt_to_idmap(x);
+       else
+               return __virt_to_phys(x);
+}
+
+#define virt_to_idmap(x)       __virt_to_idmap((unsigned long)(x))
+
 /*
  * Virtual <-> DMA view memory address translations
  * Again, these are *only* valid on the kernel direct mapped RAM
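
Per the comment above, only platforms whose interconnect exposes an alias of RAM need to override arch_virt_to_idmap. A hedged sketch of such an override (the SoC name and alias base are invented):

#include <linux/init.h>
#include <asm/memory.h>

#define MY_SOC_IDMAP_PHYS_START 0x80000000ULL   /* made-up low alias of RAM */

/* Hypothetical platform hook: return the interconnect alias for idmap use. */
static phys_addr_t my_soc_virt_to_idmap(unsigned long x)
{
        return (phys_addr_t)(x - PAGE_OFFSET) + MY_SOC_IDMAP_PHYS_START;
}

/* Hypothetical early init: install the hook before the idmap is built. */
static void __init my_soc_init_early(void)
{
        arch_virt_to_idmap = my_soc_virt_to_idmap;
}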
index 6f18da09668b5f324ad8c32e6fd6058e5b837e2a..64fd15159b7de710b81a47c0aa9d6bab3af1d691 100644 (file)
@@ -16,7 +16,7 @@ typedef struct {
 #ifdef CONFIG_CPU_HAS_ASID
 #define ASID_BITS      8
 #define ASID_MASK      ((~0ULL) << ASID_BITS)
-#define ASID(mm)       ((mm)->context.id.counter & ~ASID_MASK)
+#define ASID(mm)       ((unsigned int)((mm)->context.id.counter & ~ASID_MASK))
 #else
 #define ASID(mm)       (0)
 #endif
index f97ee02386ee063ba12b78786c9a6cd8a4106676..86a659a19526c75a2ba3b91ce839925cc52d26de 100644 (file)
@@ -181,6 +181,13 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 
 #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
 
+/*
+ * We don't have huge page support for short descriptors; for the moment,
+ * define empty stubs for use by pin_page_for_write.
+ */
+#define pmd_hugewillfault(pmd) (0)
+#define pmd_thp_or_huge(pmd)   (0)
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_PGTABLE_2LEVEL_H */
index 5689c18c85f5ebafb95a9bb2cc99fda227992976..39c54cfa03e9b103ef39982a43bac74d5436b791 100644 (file)
@@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)         (!(pmd_val(pmd) & PMD_SECT_RDONLY))
 
+#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
+#define pmd_thp_or_huge(pmd)   (pmd_huge(pmd) || pmd_trans_huge(pmd))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
 #define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
index 413f3876341cd6fd2e7bc4b1c6a71873cadaa887..c3d5fc124a054c6309ffacdb2845ff22fd5bfa56 100644 (file)
@@ -22,6 +22,7 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/ptrace.h>
 #include <asm/types.h>
+#include <asm/unified.h>
 
 #ifdef __KERNEL__
 #define STACK_TOP      ((current->personality & ADDR_LIMIT_32BIT) ? \
@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p);
 #define KSTK_EIP(tsk)  task_pt_regs(tsk)->ARM_pc
 #define KSTK_ESP(tsk)  task_pt_regs(tsk)->ARM_sp
 
+#ifdef CONFIG_SMP
+#define __ALT_SMP_ASM(smp, up)                                         \
+       "9998:  " smp "\n"                                              \
+       "       .pushsection \".alt.smp.init\", \"a\"\n"                \
+       "       .long   9998b\n"                                        \
+       "       " up "\n"                                               \
+       "       .popsection\n"
+#else
+#define __ALT_SMP_ASM(smp, up) up
+#endif
+
 /*
  * Prefetching support - only ARMv5.
  */
@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr)
 {
        __asm__ __volatile__(
                "pld\t%a0"
-               :
-               : "p" (ptr)
-               : "cc");
+               :: "p" (ptr));
 }
 
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
 #define ARCH_HAS_PREFETCHW
-#define prefetchw(ptr) prefetch(ptr)
-
-#define ARCH_HAS_SPINLOCK_PREFETCH
-#define spin_lock_prefetch(x) do { } while (0)
-
+static inline void prefetchw(const void *ptr)
+{
+       __asm__ __volatile__(
+               ".arch_extension        mp\n"
+               __ALT_SMP_ASM(
+                       WASM(pldw)              "\t%a0",
+                       WASM(pld)               "\t%a0"
+               )
+               :: "p" (ptr));
+}
+#endif
 #endif
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
index c50f0560950110b9f60d21f9ea9647be4ee77688..8d6a089dfb7628fe166f3757cd8aeba3c8270f0d 100644 (file)
@@ -49,7 +49,7 @@ extern struct meminfo meminfo;
 #define bank_phys_end(bank)    ((bank)->start + (bank)->size)
 #define bank_phys_size(bank)   (bank)->size
 
-extern int arm_add_memory(phys_addr_t start, phys_addr_t size);
+extern int arm_add_memory(u64 start, u64 size);
 extern void early_print(const char *str, ...);
 extern void dump_machine_table(void);
 
index a8cae71caceb3fb89c1ec949063b7a0d621dbdca..22a3b9b5d4a16fd4ece50bdfc83859f6ea38352f 100644 (file)
@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
 
+extern int register_ipi_completion(struct completion *completion, int cpu);
+
 struct smp_operations {
 #ifdef CONFIG_SMP
        /*
index 4f2c28060c9aa227c47e73ac6557c45add91128f..ef3c6072aa45345ae4594f22aebbe9a9ebc538f1 100644 (file)
@@ -5,21 +5,13 @@
 #error SMP not supported on pre-ARMv6 CPUs
 #endif
 
-#include <asm/processor.h>
+#include <linux/prefetch.h>
 
 /*
  * sev and wfe are ARMv6K extensions.  Uniprocessor ARMv6 may not have the K
  * extensions, so when running on UP, we have to patch these instructions away.
  */
-#define ALT_SMP(smp, up)                                       \
-       "9998:  " smp "\n"                                      \
-       "       .pushsection \".alt.smp.init\", \"a\"\n"        \
-       "       .long   9998b\n"                                \
-       "       " up "\n"                                       \
-       "       .popsection\n"
-
 #ifdef CONFIG_THUMB2_KERNEL
-#define SEV            ALT_SMP("sev.w", "nop.w")
 /*
  * For Thumb-2, special care is needed to ensure that the conditional WFE
  * instruction really does assemble to exactly 4 bytes (as required by
  * the assembler won't change IT instructions which are explicitly present
  * in the input.
  */
-#define WFE(cond)      ALT_SMP(                \
+#define WFE(cond)      __ALT_SMP_ASM(          \
        "it " cond "\n\t"                       \
        "wfe" cond ".n",                        \
                                                \
        "nop.w"                                 \
 )
 #else
-#define SEV            ALT_SMP("sev", "nop")
-#define WFE(cond)      ALT_SMP("wfe" cond, "nop")
+#define WFE(cond)      __ALT_SMP_ASM("wfe" cond, "nop")
 #endif
 
+#define SEV            __ALT_SMP_ASM(WASM(sev), WASM(nop))
+
 static inline void dsb_sev(void)
 {
 #if __LINUX_ARM_ARCH__ >= 7
@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
        u32 newval;
        arch_spinlock_t lockval;
 
+       prefetchw(&lock->slock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%3]\n"
 "      add     %1, %0, %4\n"
@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
        unsigned long contended, res;
        u32 slock;
 
+       prefetchw(&lock->slock);
        do {
                __asm__ __volatile__(
                "       ldrex   %0, [%3]\n"
@@ -127,10 +122,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
        dsb_sev();
 }
 
+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+       return lock.tickets.owner == lock.tickets.next;
+}
+
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-       struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets);
-       return tickets.owner != tickets.next;
+       return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
@@ -152,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 {
        unsigned long tmp;
 
+       prefetchw(&rw->lock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%1]\n"
 "      teq     %0, #0\n"
@@ -170,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
 {
        unsigned long contended, res;
 
+       prefetchw(&rw->lock);
        do {
                __asm__ __volatile__(
                "       ldrex   %0, [%2]\n"
@@ -203,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 }
 
 /* write_can_lock - would write_trylock() succeed? */
-#define arch_write_can_lock(x)         ((x)->lock == 0)
+#define arch_write_can_lock(x)         (ACCESS_ONCE((x)->lock) == 0)
 
 /*
  * Read locks are a bit more hairy:
@@ -221,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 {
        unsigned long tmp, tmp2;
 
+       prefetchw(&rw->lock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%2]\n"
 "      adds    %0, %0, #1\n"
@@ -241,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 
        smp_mb();
 
+       prefetchw(&rw->lock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%2]\n"
 "      sub     %0, %0, #1\n"
@@ -259,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 {
        unsigned long contended, res;
 
+       prefetchw(&rw->lock);
        do {
                __asm__ __volatile__(
                "       ldrex   %0, [%2]\n"
@@ -280,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 }
 
 /* read_can_lock - would read_trylock() succeed? */
-#define arch_read_can_lock(x)          ((x)->lock < 0x80000000)
+#define arch_read_can_lock(x)          (ACCESS_ONCE((x)->lock) < 0x80000000)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
index b262d2f8b4784eba5b6805d431468c285c434b88..47663fcb10ad7aad7e3bc87f31636a7a77342e36 100644 (file)
@@ -25,7 +25,7 @@ typedef struct {
 #define __ARCH_SPIN_LOCK_UNLOCKED      { { 0 } }
 
 typedef struct {
-       volatile unsigned int lock;
+       u32 lock;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED                { 0 }
index 38960264040cd989068b7e897979194bd5d30bc4..def9e570199f90a0c42dc7da0f8998fba6a0ab39 100644 (file)
@@ -560,37 +560,6 @@ static inline void __flush_bp_all(void)
                asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero));
 }
 
-#include <asm/cputype.h>
-#ifdef CONFIG_ARM_ERRATA_798181
-static inline int erratum_a15_798181(void)
-{
-       unsigned int midr = read_cpuid_id();
-
-       /* Cortex-A15 r0p0..r3p2 affected */
-       if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2)
-               return 0;
-       return 1;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-       /*
-        * Dummy TLBIMVAIS. Using the unmapped address 0 and ASID 0.
-        */
-       asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
-       dsb(ish);
-}
-#else
-static inline int erratum_a15_798181(void)
-{
-       return 0;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-}
-#endif
-
 /*
  *     flush_pmd_entry
  *
@@ -697,4 +666,21 @@ extern void flush_bp_all(void);
 
 #endif
 
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_ARM_ERRATA_798181
+extern void erratum_a15_798181_init(void);
+#else
+static inline void erratum_a15_798181_init(void) {}
+#endif
+extern bool (*erratum_a15_798181_handler)(void);
+
+static inline bool erratum_a15_798181(void)
+{
+       if (unlikely(IS_ENABLED(CONFIG_ARM_ERRATA_798181) &&
+               erratum_a15_798181_handler))
+               return erratum_a15_798181_handler();
+       return false;
+}
+#endif
+
 #endif
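
The erratum test is now routed through a function pointer installed once at boot, so unaffected CPUs only pay an unlikely() branch. A hedged sketch of the same dispatch pattern with made-up names (quirk_* and some_workaround are illustrative; only IS_ENABLED() and the config symbol are real):

        static bool some_workaround(void)
        {
                /* stands in for the real TLB maintenance sequence */
                return true;
        }

        static bool (*quirk_handler)(void);     /* NULL when the CPU is unaffected */

        static void quirk_init(bool affected)
        {
                if (affected)
                        quirk_handler = some_workaround;
        }

        static inline bool quirk_active(void)
        {
                /* the IS_ENABLED() check folds away when the option is off */
                return IS_ENABLED(CONFIG_ARM_ERRATA_798181) && quirk_handler ?
                        quirk_handler() : false;
        }
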
index f5989f46b4d2d450f18b24faa946de750394ec13..b88beaba6b4a5cf9c0ccded73723c4ea41539167 100644 (file)
@@ -38,6 +38,8 @@
 #ifdef __ASSEMBLY__
 #define W(instr)       instr.w
 #define BSYM(sym)      sym + 1
+#else
+#define WASM(instr)    #instr ".w"
 #endif
 
 #else  /* !CONFIG_THUMB2_KERNEL */
@@ -50,6 +52,8 @@
 #ifdef __ASSEMBLY__
 #define W(instr)       instr
 #define BSYM(sym)      sym
+#else
+#define WASM(instr)    #instr
 #endif
 
 #endif /* CONFIG_THUMB2_KERNEL */
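
WASM() is the C counterpart of the assembler-only W() macro: it stringifies the mnemonic and appends ".w" on Thumb-2 kernels so that inline assembly emits the wide encoding. A hedged usage sketch (the sev mnemonic and macro name are just examples):

        /* expands to "sev.w" under CONFIG_THUMB2_KERNEL, plain "sev" otherwise */
        #define SKETCH_SEV()    __asm__ __volatile__(WASM(sev) : : : "memory")
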
diff --git a/arch/arm/include/debug/efm32.S b/arch/arm/include/debug/efm32.S
new file mode 100644 (file)
index 0000000..2265a19
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2013 Pengutronix
+ * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define UARTn_CMD              0x000c
+#define UARTn_CMD_TXEN                 0x0004
+
+#define        UARTn_STATUS            0x0010
+#define        UARTn_STATUS_TXC                0x0020
+#define        UARTn_STATUS_TXBL               0x0040
+
+#define        UARTn_TXDATA            0x0034
+
+               .macro  addruart, rx, tmp
+               ldr     \rx, =(CONFIG_DEBUG_UART_PHYS)
+
+               /*
+                * Enable TX. The driver might disable it to save energy. We
+                * don't bother disabling it again at the end, as power
+                * consumption isn't that important while debugging.
+                */
+               ldr     \tmp, =(UARTn_CMD_TXEN)
+               str     \tmp, [\rx, #UARTn_CMD]
+               .endm
+
+               .macro  senduart,rd,rx
+               strb    \rd, [\rx, #UARTn_TXDATA]
+               .endm
+
+               .macro  waituart,rd,rx
+1001:          ldr     \rd, [\rx, #UARTn_STATUS]
+               tst     \rd, #UARTn_STATUS_TXBL
+               beq     1001b
+               .endm
+
+               .macro  busyuart,rd,rx
+1001:          ldr     \rd, [\rx, #UARTn_STATUS]
+               tst     \rd, #UARTn_STATUS_TXC
+               bne     1001b
+               .endm
index 9166e1bc470e0153107f059ddd5fbec2570fb5a6..9d653d475903b56ea14e3e7c568319c963d9ceca 100644 (file)
 #ifdef CONFIG_DEBUG_MSM8960_UART
 #define MSM_DEBUG_UART_BASE    0xF0040000
 #define MSM_DEBUG_UART_PHYS    0x16440000
+#endif
+
+#ifdef CONFIG_DEBUG_MSM8974_UART
+#define MSM_DEBUG_UART_BASE    0xFA71E000
+#define MSM_DEBUG_UART_PHYS    0xF991E000
 #endif
 
        .macro  addruart, rp, rv, tmp
index 37c6895b87e6d72ce5837e57a4bae0a367339cb2..92ef808a23377275124a59550b6ab86f3e163486 100644 (file)
 
                .macro  waituart,rd,rx
 1001:          ldr     \rd, [\rx, #UART01x_FR]
+ ARM_BE8(      rev     \rd, \rd )
                tst     \rd, #UART01x_FR_TXFF
                bne     1001b
                .endm
 
                .macro  busyuart,rd,rx
 1001:          ldr     \rd, [\rx, #UART01x_FR]
+ ARM_BE8(      rev     \rd, \rd )
                tst     \rd, #UART01x_FR_BUSY
                bne     1001b
                .endm
index 18d76fd5a2afb2bf27b91980f29c77094e6638c0..70a1c9da30ca39d4d4d79e8e73c3b8aec0d607e3 100644 (file)
@@ -7,6 +7,7 @@ header-y += hwcap.h
 header-y += ioctls.h
 header-y += kvm_para.h
 header-y += mman.h
+header-y += perf_regs.h
 header-y += posix_types.h
 header-y += ptrace.h
 header-y += setup.h
diff --git a/arch/arm/include/uapi/asm/perf_regs.h b/arch/arm/include/uapi/asm/perf_regs.h
new file mode 100644 (file)
index 0000000..ce59448
--- /dev/null
@@ -0,0 +1,23 @@
+#ifndef _ASM_ARM_PERF_REGS_H
+#define _ASM_ARM_PERF_REGS_H
+
+enum perf_event_arm_regs {
+       PERF_REG_ARM_R0,
+       PERF_REG_ARM_R1,
+       PERF_REG_ARM_R2,
+       PERF_REG_ARM_R3,
+       PERF_REG_ARM_R4,
+       PERF_REG_ARM_R5,
+       PERF_REG_ARM_R6,
+       PERF_REG_ARM_R7,
+       PERF_REG_ARM_R8,
+       PERF_REG_ARM_R9,
+       PERF_REG_ARM_R10,
+       PERF_REG_ARM_FP,
+       PERF_REG_ARM_IP,
+       PERF_REG_ARM_SP,
+       PERF_REG_ARM_LR,
+       PERF_REG_ARM_PC,
+       PERF_REG_ARM_MAX,
+};
+#endif /* _ASM_ARM_PERF_REGS_H */
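
User space selects which of these registers to sample through the generic perf ABI. A hedged example, where PERF_SAMPLE_REGS_USER and sample_regs_user are the existing perf_event_attr interface and only the choice of registers is illustrative:

        #include <linux/perf_event.h>
        #include <asm/perf_regs.h>      /* the enum above */

        static void sample_arm_pc_sp_lr(struct perf_event_attr *attr)
        {
                attr->sample_type     |= PERF_SAMPLE_REGS_USER;
                attr->sample_regs_user = (1ULL << PERF_REG_ARM_PC) |
                                         (1ULL << PERF_REG_ARM_SP) |
                                         (1ULL << PERF_REG_ARM_LR);
        }
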
index 5140df5f23aa485214914a8dfbfdf31dc04a5691..a30fc9be9e9e6abc8ccbedb17fb61bddba71bcd7 100644 (file)
@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg
 
 obj-y          := elf.o entry-common.o irq.o opcodes.o \
                   process.o ptrace.o return_address.o \
-                  setup.o signal.o stacktrace.o sys_arm.o time.o traps.o
+                  setup.o signal.o sigreturn_codes.o \
+                  stacktrace.o sys_arm.o time.o traps.o
 
 obj-$(CONFIG_ATAGS)            += atags_parse.o
 obj-$(CONFIG_ATAGS_PROC)       += atags_proc.o
@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3)                += xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)       += xscale-cp0.o
 obj-$(CONFIG_CPU_PJ4)          += pj4-cp0.o
 obj-$(CONFIG_IWMMXT)           += iwmmxt.o
+obj-$(CONFIG_PERF_EVENTS)      += perf_regs.o
 obj-$(CONFIG_HW_PERF_EVENTS)   += perf_event.o perf_event_cpu.o
 AFLAGS_iwmmxt.o                        := -Wa,-mcpu=iwmmxt
 obj-$(CONFIG_ARM_CPU_TOPOLOGY)  += topology.o
index 60d3b738d4200987e76c75b2e9da513920087a89..1f031ddd0667a3e842317a90c59db3acc2284894 100644 (file)
@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
 
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 EXPORT_SYMBOL(__pv_phys_offset);
+EXPORT_SYMBOL(__pv_offset);
 #endif
index 9cbe70c8b0ef7b8d16a806602608fba205966d31..b3fb8c9e1ff2d75f15666a9233dee2d1b3ccb6bc 100644 (file)
@@ -192,6 +192,7 @@ __dabt_svc:
        svc_entry
        mov     r2, sp
        dabt_helper
+ THUMB(        ldr     r5, [sp, #S_PSR]        )       @ potentially updated CPSR
        svc_exit r5                             @ return from exception
  UNWIND(.fnend         )
 ENDPROC(__dabt_svc)
@@ -416,9 +417,8 @@ __und_usr:
        bne     __und_usr_thumb
        sub     r4, r2, #4                      @ ARM instr at LR - 4
 1:     ldrt    r0, [r4]
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       rev     r0, r0                          @ little endian instruction
-#endif
+ ARM_BE8(rev   r0, r0)                         @ little endian instruction
+
        @ r0 = 32-bit ARM instruction which caused the exception
        @ r2 = PC value for the following instruction (:= regs->ARM_pc)
        @ r4 = PC value for the faulting instruction
index bc6bd9683ba4555d9713e1693b258e6204fc90a5..a2dcafdf1bc89a176f80dda748625a615717979a 100644 (file)
@@ -393,9 +393,7 @@ ENTRY(vector_swi)
 #else
  USER( ldr     r10, [lr, #-4]          )       @ get SWI instruction
 #endif
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       rev     r10, r10                        @ little endian instruction
-#endif
+ ARM_BE8(rev   r10, r10)                       @ little endian instruction
 
 #elif defined(CONFIG_AEABI)
 
index 476de57dcef284602e126e29e2da03465df8ff09..7801866e626a2a1a4631d9e3e3fbd3c27ddda429 100644 (file)
@@ -77,6 +77,7 @@
 
        __HEAD
 ENTRY(stext)
+ ARM_BE8(setend        be )                    @ ensure we are in BE8 mode
 
  THUMB(        adr     r9, BSYM(1f)    )       @ Kernel is always entered in ARM.
  THUMB(        bx      r9              )       @ If this is a Thumb-2 kernel,
@@ -352,6 +353,9 @@ ENTRY(secondary_startup)
         * the processor type - there is no need to check the machine type
         * as it has already been validated by the primary processor.
         */
+
+ ARM_BE8(setend        be)                             @ ensure we are in BE8 mode
+
 #ifdef CONFIG_ARM_VIRT_EXT
        bl      __hyp_stub_install_secondary
 #endif
@@ -555,6 +559,14 @@ ENTRY(fixup_smp)
        ldmfd   sp!, {r4 - r6, pc}
 ENDPROC(fixup_smp)
 
+#ifdef __ARMEB__
+#define LOW_OFFSET     0x4
+#define HIGH_OFFSET    0x0
+#else
+#define LOW_OFFSET     0x0
+#define HIGH_OFFSET    0x4
+#endif
+
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 
 /* __fixup_pv_table - patch the stub instructions with the delta between
@@ -565,17 +577,20 @@ ENDPROC(fixup_smp)
        __HEAD
 __fixup_pv_table:
        adr     r0, 1f
-       ldmia   r0, {r3-r5, r7}
-       sub     r3, r0, r3      @ PHYS_OFFSET - PAGE_OFFSET
+       ldmia   r0, {r3-r7}
+       mvn     ip, #0
+       subs    r3, r0, r3      @ PHYS_OFFSET - PAGE_OFFSET
        add     r4, r4, r3      @ adjust table start address
        add     r5, r5, r3      @ adjust table end address
-       add     r7, r7, r3      @ adjust __pv_phys_offset address
-       str     r8, [r7]        @ save computed PHYS_OFFSET to __pv_phys_offset
+       add     r6, r6, r3      @ adjust __pv_phys_offset address
+       add     r7, r7, r3      @ adjust __pv_offset address
+       str     r8, [r6, #LOW_OFFSET]   @ save computed PHYS_OFFSET to __pv_phys_offset
+       strcc   ip, [r7, #HIGH_OFFSET]  @ save to __pv_offset high bits
        mov     r6, r3, lsr #24 @ constant for add/sub instructions
        teq     r3, r6, lsl #24 @ must be 16MiB aligned
 THUMB( it      ne              @ cross section branch )
        bne     __error
-       str     r6, [r7, #4]    @ save to __pv_offset
+       str     r3, [r7, #LOW_OFFSET]   @ save to __pv_offset low bits
        b       __fixup_a_pv_table
 ENDPROC(__fixup_pv_table)
 
@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table)
        .long   __pv_table_begin
        .long   __pv_table_end
 2:     .long   __pv_phys_offset
+       .long   __pv_offset
 
        .text
 __fixup_a_pv_table:
+       adr     r0, 3f
+       ldr     r6, [r0]
+       add     r6, r6, r3
+       ldr     r0, [r6, #HIGH_OFFSET]  @ pv_offset high word
+       ldr     r6, [r6, #LOW_OFFSET]   @ pv_offset low word
+       mov     r6, r6, lsr #24
+       cmn     r0, #1
 #ifdef CONFIG_THUMB2_KERNEL
+       moveq   r0, #0x200000   @ set bit 21, mov to mvn instruction
        lsls    r6, #24
        beq     2f
        clz     r7, r6
@@ -601,18 +625,42 @@ __fixup_a_pv_table:
        b       2f
 1:     add     r7, r3
        ldrh    ip, [r7, #2]
-       and     ip, 0x8f00
-       orr     ip, r6  @ mask in offset bits 31-24
+ARM_BE8(rev16  ip, ip)
+       tst     ip, #0x4000
+       and     ip, #0x8f00
+       orrne   ip, r6  @ mask in offset bits 31-24
+       orreq   ip, r0  @ mask in offset bits 7-0
+ARM_BE8(rev16  ip, ip)
        strh    ip, [r7, #2]
+       bne     2f
+       ldrh    ip, [r7]
+ARM_BE8(rev16  ip, ip)
+       bic     ip, #0x20
+       orr     ip, ip, r0, lsr #16
+ARM_BE8(rev16  ip, ip)
+       strh    ip, [r7]
 2:     cmp     r4, r5
        ldrcc   r7, [r4], #4    @ use branch for delay slot
        bcc     1b
        bx      lr
 #else
+       moveq   r0, #0x400000   @ set bit 22, mov to mvn instruction
        b       2f
 1:     ldr     ip, [r7, r3]
+#ifdef CONFIG_CPU_ENDIAN_BE8
+       @ in BE8, we load data in BE, but instructions still in LE
+       bic     ip, ip, #0xff000000
+       tst     ip, #0x000f0000 @ check the rotation field
+       orrne   ip, ip, r6, lsl #24 @ mask in offset bits 31-24
+       biceq   ip, ip, #0x00004000 @ clear bit 22
+       orreq   ip, ip, r0, lsl #24 @ mask in offset bits 7-0
+#else
        bic     ip, ip, #0x000000ff
-       orr     ip, ip, r6      @ mask in offset bits 31-24
+       tst     ip, #0xf00      @ check the rotation field
+       orrne   ip, ip, r6      @ mask in offset bits 31-24
+       biceq   ip, ip, #0x400000       @ clear bit 22
+       orreq   ip, ip, r0      @ mask in offset bits 7-0
+#endif
        str     ip, [r7, r3]
 2:     cmp     r4, r5
        ldrcc   r7, [r4], #4    @ use branch for delay slot
@@ -621,28 +669,30 @@ __fixup_a_pv_table:
 #endif
 ENDPROC(__fixup_a_pv_table)
 
+       .align
+3:     .long __pv_offset
+
 ENTRY(fixup_pv_table)
        stmfd   sp!, {r4 - r7, lr}
-       ldr     r2, 2f                  @ get address of __pv_phys_offset
        mov     r3, #0                  @ no offset
        mov     r4, r0                  @ r0 = table start
        add     r5, r0, r1              @ r1 = table size
-       ldr     r6, [r2, #4]            @ get __pv_offset
        bl      __fixup_a_pv_table
        ldmfd   sp!, {r4 - r7, pc}
 ENDPROC(fixup_pv_table)
 
-       .align
-2:     .long   __pv_phys_offset
-
        .data
        .globl  __pv_phys_offset
        .type   __pv_phys_offset, %object
 __pv_phys_offset:
-       .long   0
-       .size   __pv_phys_offset, . - __pv_phys_offset
+       .quad   0
+       .size   __pv_phys_offset, . -__pv_phys_offset
+
+       .globl  __pv_offset
+       .type   __pv_offset, %object
 __pv_offset:
-       .long   0
+       .quad   0
+       .size   __pv_offset, . -__pv_offset
 #endif
 
 #include "head-common.S"
index 7b95de6013571df3528716f2bba00f8dfd13badf..3d446605cbf84b89890bdf5bb2398a64a8401120 100644 (file)
@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
                /* Breakpoint */
                ctrl_base = ARM_BASE_BCR;
                val_base = ARM_BASE_BVR;
-               slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
        } else {
                /* Watchpoint */
                ctrl_base = ARM_BASE_WCR;
                val_base = ARM_BASE_WVR;
-               slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
        }
 
@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
        if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
                /* Breakpoint */
                base = ARM_BASE_BCR;
-               slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
        } else {
                /* Watchpoint */
                base = ARM_BASE_WCR;
-               slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
        }
 
@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
        struct arch_hw_breakpoint *info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+       slots = this_cpu_ptr(wp_on_reg);
 
        for (i = 0; i < core_num_wrps; ++i) {
                rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handler(unsigned long pc)
        struct perf_event *wp, **slots;
        struct arch_hw_breakpoint *info;
 
-       slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+       slots = this_cpu_ptr(wp_on_reg);
 
        for (i = 0; i < core_num_wrps; ++i) {
                rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
        struct arch_hw_breakpoint *info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+       slots = this_cpu_ptr(bp_on_reg);
 
        /* The exception entry code places the amended lr in the PC. */
        addr = regs->ARM_pc;
index 170e9f34003f414030c6070e8c913df21c5091cd..a7b621ece23d3c729d681e8004952c320215e9fc 100644 (file)
@@ -171,13 +171,13 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
 
 static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+       __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
        kcb->kprobe_status = kcb->prev_kprobe.status;
 }
 
 static void __kprobes set_current_kprobe(struct kprobe *p)
 {
-       __get_cpu_var(current_kprobe) = p;
+       __this_cpu_write(current_kprobe, p);
 }
 
 static void __kprobes
@@ -421,10 +421,10 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
                        continue;
 
                if (ri->rp && ri->rp->handler) {
-                       __get_cpu_var(current_kprobe) = &ri->rp->kp;
+                       __this_cpu_write(current_kprobe, &ri->rp->kp);
                        get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
                        ri->rp->handler(ri, regs);
-                       __get_cpu_var(current_kprobe) = NULL;
+                       __this_cpu_write(current_kprobe, NULL);
                }
 
                orig_ret_address = (unsigned long)ri->ret_addr;
index c9dfff3b80082e60fa2244efd1a23216293356ec..45e478157278e331ac6474ca5dbac859415b0fff 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/sections.h>
 #include <asm/smp_plat.h>
 #include <asm/unwind.h>
+#include <asm/opcodes.h>
 
 #ifdef CONFIG_XIP_KERNEL
 /*
@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                Elf32_Sym *sym;
                const char *symname;
                s32 offset;
+               u32 tmp;
 #ifdef CONFIG_THUMB2_KERNEL
                u32 upper, lower, sign, j1, j2;
 #endif
@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                case R_ARM_PC24:
                case R_ARM_CALL:
                case R_ARM_JUMP24:
-                       offset = (*(u32 *)loc & 0x00ffffff) << 2;
+                       offset = __mem_to_opcode_arm(*(u32 *)loc);
+                       offset = (offset & 0x00ffffff) << 2;
                        if (offset & 0x02000000)
                                offset -= 0x04000000;
 
@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        }
 
                        offset >>= 2;
+                       offset &= 0x00ffffff;
 
-                       *(u32 *)loc &= 0xff000000;
-                       *(u32 *)loc |= offset & 0x00ffffff;
+                       *(u32 *)loc &= __opcode_to_mem_arm(0xff000000);
+                       *(u32 *)loc |= __opcode_to_mem_arm(offset);
                        break;
 
               case R_ARM_V4BX:
@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        * other bits to re-code instruction as
                        * MOV PC,Rm.
                        */
-                      *(u32 *)loc &= 0xf000000f;
-                      *(u32 *)loc |= 0x01a0f000;
+                      *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f);
+                      *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000);
                       break;
 
                case R_ARM_PREL31:
@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 
                case R_ARM_MOVW_ABS_NC:
                case R_ARM_MOVT_ABS:
-                       offset = *(u32 *)loc;
+                       offset = tmp = __mem_to_opcode_arm(*(u32 *)loc);
                        offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff);
                        offset = (offset ^ 0x8000) - 0x8000;
 
@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS)
                                offset >>= 16;
 
-                       *(u32 *)loc &= 0xfff0f000;
-                       *(u32 *)loc |= ((offset & 0xf000) << 4) |
-                                       (offset & 0x0fff);
+                       tmp &= 0xfff0f000;
+                       tmp |= ((offset & 0xf000) << 4) |
+                               (offset & 0x0fff);
+
+                       *(u32 *)loc = __opcode_to_mem_arm(tmp);
                        break;
 
 #ifdef CONFIG_THUMB2_KERNEL
                case R_ARM_THM_CALL:
                case R_ARM_THM_JUMP24:
-                       upper = *(u16 *)loc;
-                       lower = *(u16 *)(loc + 2);
+                       upper = __mem_to_opcode_thumb16(*(u16 *)loc);
+                       lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
                        /*
                         * 25 bit signed address range (Thumb-2 BL and B.W
@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        sign = (offset >> 24) & 1;
                        j1 = sign ^ (~(offset >> 23) & 1);
                        j2 = sign ^ (~(offset >> 22) & 1);
-                       *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) |
+                       upper = (u16)((upper & 0xf800) | (sign << 10) |
                                            ((offset >> 12) & 0x03ff));
-                       *(u16 *)(loc + 2) = (u16)((lower & 0xd000) |
-                                                 (j1 << 13) | (j2 << 11) |
-                                                 ((offset >> 1) & 0x07ff));
+                       lower = (u16)((lower & 0xd000) |
+                                     (j1 << 13) | (j2 << 11) |
+                                     ((offset >> 1) & 0x07ff));
+
+                       *(u16 *)loc = __opcode_to_mem_thumb16(upper);
+                       *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
                        break;
 
                case R_ARM_THM_MOVW_ABS_NC:
                case R_ARM_THM_MOVT_ABS:
-                       upper = *(u16 *)loc;
-                       lower = *(u16 *)(loc + 2);
+                       upper = __mem_to_opcode_thumb16(*(u16 *)loc);
+                       lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
                        /*
                         * MOVT/MOVW instructions encoding in Thumb-2:
@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS)
                                offset >>= 16;
 
-                       *(u16 *)loc = (u16)((upper & 0xfbf0) |
-                                           ((offset & 0xf000) >> 12) |
-                                           ((offset & 0x0800) >> 1));
-                       *(u16 *)(loc + 2) = (u16)((lower & 0x8f00) |
-                                                 ((offset & 0x0700) << 4) |
-                                                 (offset & 0x00ff));
+                       upper = (u16)((upper & 0xfbf0) |
+                                     ((offset & 0xf000) >> 12) |
+                                     ((offset & 0x0800) >> 1));
+                       lower = (u16)((lower & 0x8f00) |
+                                     ((offset & 0x0700) << 4) |
+                                     (offset & 0x00ff));
+                       *(u16 *)loc = __opcode_to_mem_thumb16(upper);
+                       *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
                        break;
 #endif
 
index e186ee1e63f6c85261f96844a594080e719c2e07..bc3f2efa0d86b4ff55d6b19833eae688b111fd27 100644 (file)
@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events,
               struct perf_event *event)
 {
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-       struct pmu *leader_pmu = event->group_leader->pmu;
 
        if (is_software_event(event))
                return 1;
 
-       if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF)
+       if (event->state < PERF_EVENT_STATE_OFF)
                return 1;
 
        if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
index 8d6147b2001f82eca02c738265f9477dc2dbaefa..d85055cd24bacc55cde9e99f1547afbfe33317ae 100644 (file)
@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters);
 
 static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
 {
-       return &__get_cpu_var(cpu_hw_events);
+       return this_cpu_ptr(&cpu_hw_events);
 }
 
 static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
new file mode 100644 (file)
index 0000000..6e4379c
--- /dev/null
@@ -0,0 +1,30 @@
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+       if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
+               return 0;
+
+       return regs->uregs[idx];
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
+
+int perf_reg_validate(u64 mask)
+{
+       if (!mask || mask & REG_RESERVED)
+               return -EINVAL;
+
+       return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+       return PERF_SAMPLE_REGS_ABI_32;
+}
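
With PERF_REG_ARM_MAX equal to 16, REG_RESERVED works out to ~0xffffULL, so perf_reg_validate() accepts only non-empty masks confined to the sixteen defined registers. A small hedged sanity-check sketch of that arithmetic:

        static void validate_mask_examples(void)
        {
                /* (1ULL << 16) - 1 == 0xffff, hence REG_RESERVED == ~0xffffULL */
                WARN_ON(perf_reg_validate(0) != -EINVAL);               /* empty mask   */
                WARN_ON(perf_reg_validate(1ULL << PERF_REG_ARM_PC));    /* defined reg  */
                WARN_ON(perf_reg_validate(1ULL << 16) != -EINVAL);      /* reserved bit */
        }
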
index 5d65438685d8516f6455393940e4bdaed107699a..6a1b8a81b1ae448168572a9558aaf026f9e7e47e 100644 (file)
@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup);
 #endif
 
 extern void paging_init(const struct machine_desc *desc);
+extern void early_paging_init(const struct machine_desc *,
+                             struct proc_info_list *);
 extern void sanity_check_meminfo(void);
 extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(const struct machine_desc *desc);
@@ -599,6 +601,8 @@ static void __init setup_processor(void)
        elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
 #endif
 
+       erratum_a15_798181_init();
+
        feat_v6_fixup();
 
        cacheid_init();
@@ -619,9 +623,10 @@ void __init dump_machine_table(void)
                /* can't use cpu_relax() here as it may require MMU setup */;
 }
 
-int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
+int __init arm_add_memory(u64 start, u64 size)
 {
        struct membank *bank = &meminfo.bank[meminfo.nr_banks];
+       u64 aligned_start;
 
        if (meminfo.nr_banks >= NR_BANKS) {
                printk(KERN_CRIT "NR_BANKS too low, "
@@ -634,10 +639,16 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
         * Size is appropriately rounded down, start is rounded up.
         */
        size -= start & ~PAGE_MASK;
-       bank->start = PAGE_ALIGN(start);
+       aligned_start = PAGE_ALIGN(start);
 
-#ifndef CONFIG_ARM_LPAE
-       if (bank->start + size < bank->start) {
+#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
+       if (aligned_start > ULONG_MAX) {
+               printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
+                      "32-bit physical address space\n", (long long)start);
+               return -EINVAL;
+       }
+
+       if (aligned_start + size > ULONG_MAX) {
                printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
                        "32-bit physical address space\n", (long long)start);
                /*
@@ -645,10 +656,11 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
                 * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
                 * This means we lose a page after masking.
                 */
-               size = ULONG_MAX - bank->start;
+               size = ULONG_MAX - aligned_start;
        }
 #endif
 
+       bank->start = aligned_start;
        bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);
 
        /*
@@ -669,8 +681,8 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
 static int __init early_mem(char *p)
 {
        static int usermem __initdata = 0;
-       phys_addr_t size;
-       phys_addr_t start;
+       u64 size;
+       u64 start;
        char *endp;
 
        /*
@@ -878,6 +890,8 @@ void __init setup_arch(char **cmdline_p)
        parse_early_param();
 
        sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
+
+       early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
        sanity_check_meminfo();
        arm_memblock_init(&meminfo, mdesc);
 
index ab330422527203417472a21727b1062c8a38e70f..04d63880037f9c293741230f95e9520c19ee3dee 100644 (file)
 #include <asm/unistd.h>
 #include <asm/vfp.h>
 
-/*
- * For ARM syscalls, we encode the syscall number into the instruction.
- */
-#define SWI_SYS_SIGRETURN      (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-#define SWI_SYS_RT_SIGRETURN   (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-
-/*
- * With EABI, the syscall number has to be loaded into r7.
- */
-#define MOV_R7_NR_SIGRETURN    (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE))
-#define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
-
-/*
- * For Thumb syscalls, we pass the syscall number via r7.  We therefore
- * need two 16-bit instructions.
- */
-#define SWI_THUMB_SIGRETURN    (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE))
-#define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
-
-static const unsigned long sigreturn_codes[7] = {
-       MOV_R7_NR_SIGRETURN,    SWI_SYS_SIGRETURN,    SWI_THUMB_SIGRETURN,
-       MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN,
-};
+extern const unsigned long sigreturn_codes[7];
 
 static unsigned long signal_return_offset;
 
@@ -375,12 +353,18 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig,
                 */
                thumb = handler & 1;
 
-               if (thumb) {
-                       cpsr |= PSR_T_BIT;
 #if __LINUX_ARM_ARCH__ >= 7
-                       /* clear the If-Then Thumb-2 execution state */
-                       cpsr &= ~PSR_IT_MASK;
+               /*
+                * Clear the If-Then Thumb-2 execution state. The ARM
+                * spec requires this to be all 0s in ARM mode, and
+                * Snapdragon S4/Krait misbehaves on a Thumb=>ARM
+                * signal transition without this.
+                */
+               cpsr &= ~PSR_IT_MASK;
 #endif
+
+               if (thumb) {
+                       cpsr |= PSR_T_BIT;
                } else
                        cpsr &= ~PSR_T_BIT;
        }
diff --git a/arch/arm/kernel/sigreturn_codes.S b/arch/arm/kernel/sigreturn_codes.S
new file mode 100644 (file)
index 0000000..3c5d0f2
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * sigreturn_codes.S - code snippets for sigreturn syscalls
+ *
+ * Created by: Victor Kamensky, 2013-08-13
+ * Copyright:  (C) 2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <asm/unistd.h>
+
+/*
+ * For ARM syscalls, we encode the syscall number into the instruction.
+ * With EABI, the syscall number has to be loaded into r7, so the ARM
+ * syscall sequence snippet is a move plus an svc in .arm encoding.
+ *
+ * For Thumb syscalls, we pass the syscall number via r7.  We therefore
+ * need two 16-bit instructions in .thumb encoding.
+ *
+ * Please note that the sigreturn_codes snippets are not executed in
+ * place.  Instead they are copied by the kernel into the appropriate
+ * places.  The code in arch/arm/kernel/signal.c is very sensitive to
+ * the layout of these snippets.
+ */
+
+#if __LINUX_ARM_ARCH__ <= 4
+       /*
+        * Note that for early arch versions we manually select the
+        * minimal architecture that supports the required Thumb
+        * opcodes. It is OK for this file to be combined with other,
+        * lower arch variants, since these code snippets are only
+        * used as input data.
+        */
+       .arch armv4t
+#endif
+
+       .section .rodata
+       .global sigreturn_codes
+       .type   sigreturn_codes, #object
+
+       .arm
+
+sigreturn_codes:
+
+       /* ARM sigreturn syscall code snippet */
+       mov     r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
+       swi     #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)
+
+       /* Thumb sigreturn syscall code snippet */
+       .thumb
+       movs    r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
+       swi     #0
+
+       /* ARM sigreturn_rt syscall code snippet */
+       .arm
+       mov     r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
+       swi     #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)
+
+       /* Thumb sigreturn_rt syscall code snippet */
+       .thumb
+       movs    r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
+       swi     #0
+
+       /*
+        * Note on the additional space: the setup_return code in
+        * signal.c always copies two words, regardless of whether it
+        * is the Thumb case or not, so we need an additional word
+        * after the real last entry.
+        */
+       .arm
+       .space  4
+
+       .size   sigreturn_codes, . - sigreturn_codes
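
These snippets are consumed by setup_return() in signal.c, which picks the ARM or Thumb, sigreturn or rt_sigreturn variant and copies two words onto the user stack as the return trampoline. A hedged sketch of that selection; the index arithmetic mirrors the table layout above, while the function wrapper and its names are illustrative:

        static void copy_retcode(unsigned long retcode[2], int thumb, int rt)
        {
                unsigned int idx = thumb << 1;          /* 0 = ARM, 2 = Thumb */

                if (rt)                                 /* rt_sigreturn variants */
                        idx += 3;

                retcode[0] = sigreturn_codes[idx];
                retcode[1] = sigreturn_codes[idx + 1];  /* always two words */
        }
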
index db1536b8b30b497fe11923dfe9f67a72f86f7954..b907d9b790ab7234171c713c4e6c9aed957868ea 100644 (file)
@@ -55,6 +55,7 @@
  * specific registers and some other data for resume.
  *  r0 = suspend function arg0
  *  r1 = suspend function
+ *  r2 = MPIDR value the resuming CPU will use
  */
 ENTRY(__cpu_suspend)
        stmfd   sp!, {r4 - r11, lr}
@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend)
        mov     r5, sp                  @ current virtual SP
        add     r4, r4, #12             @ Space for pgd, virt sp, phys resume fn
        sub     sp, sp, r4              @ allocate CPU state on stack
-       stmfd   sp!, {r0, r1}           @ save suspend func arg and pointer
-       add     r0, sp, #8              @ save pointer to save block
-       mov     r1, r4                  @ size of save block
-       mov     r2, r5                  @ virtual SP
        ldr     r3, =sleep_save_sp
+       stmfd   sp!, {r0, r1}           @ save suspend func arg and pointer
        ldr     r3, [r3, #SLEEP_SAVE_SP_VIRT]
-       ALT_SMP(mrc p15, 0, r9, c0, c0, 5)
-        ALT_UP_B(1f)
-       ldr     r8, =mpidr_hash
-       /*
-        * This ldmia relies on the memory layout of the mpidr_hash
-        * struct mpidr_hash.
-        */
-       ldmia   r8, {r4-r7}     @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts
-       compute_mpidr_hash      lr, r5, r6, r7, r9, r4
-       add     r3, r3, lr, lsl #2
-1:
+       ALT_SMP(ldr r0, =mpidr_hash)
+       ALT_UP_B(1f)
+       /* This ldmia relies on the memory layout of the mpidr_hash struct */
+       ldmia   r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
+       compute_mpidr_hash      r0, r6, r7, r8, r2, r1
+       add     r3, r3, r0, lsl #2
+1:     mov     r2, r5                  @ virtual SP
+       mov     r1, r4                  @ size of save block
+       add     r0, sp, #8              @ pointer to save block
        bl      __cpu_suspend_save
        adr     lr, BSYM(cpu_suspend_abort)
        ldmfd   sp!, {r0, pc}           @ call suspend fn
@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu)
        .data
        .align
 ENTRY(cpu_resume)
+ARM_BE8(setend be)                     @ ensure we are in BE mode
        mov     r1, #0
        ALT_SMP(mrc p15, 0, r0, c0, c0, 5)
        ALT_UP_B(1f)
index 72024ea8a3a6c07038103e153527cb2454e4552f..dc894ab3622b1effac1b885bd919bf629f66dc5d 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/clockchips.h>
 #include <linux/completion.h>
 #include <linux/cpufreq.h>
+#include <linux/irq_work.h>
 
 #include <linux/atomic.h>
 #include <asm/smp.h>
@@ -66,6 +67,8 @@ enum ipi_msg_type {
        IPI_CALL_FUNC,
        IPI_CALL_FUNC_SINGLE,
        IPI_CPU_STOP,
+       IPI_IRQ_WORK,
+       IPI_COMPLETION,
 };
 
 static DECLARE_COMPLETION(cpu_running);
@@ -80,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops)
 
 static unsigned long get_arch_pgd(pgd_t *pgd)
 {
-       phys_addr_t pgdir = virt_to_phys(pgd);
+       phys_addr_t pgdir = virt_to_idmap(pgd);
        BUG_ON(pgdir & ARCH_PGD_MASK);
        return pgdir >> ARCH_PGD_SHIFT;
 }
@@ -448,6 +451,14 @@ void arch_send_call_function_single_ipi(int cpu)
        smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
 }
 
+#ifdef CONFIG_IRQ_WORK
+void arch_irq_work_raise(void)
+{
+       if (is_smp())
+               smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
+}
+#endif
+
 static const char *ipi_types[NR_IPI] = {
 #define S(x,s) [x] = s
        S(IPI_WAKEUP, "CPU wakeup interrupts"),
@@ -456,6 +467,8 @@ static const char *ipi_types[NR_IPI] = {
        S(IPI_CALL_FUNC, "Function call interrupts"),
        S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
        S(IPI_CPU_STOP, "CPU stop interrupts"),
+       S(IPI_IRQ_WORK, "IRQ work interrupts"),
+       S(IPI_COMPLETION, "completion interrupts"),
 };
 
 void show_ipi_list(struct seq_file *p, int prec)
@@ -515,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu)
                cpu_relax();
 }
 
+static DEFINE_PER_CPU(struct completion *, cpu_completion);
+
+int register_ipi_completion(struct completion *completion, int cpu)
+{
+       per_cpu(cpu_completion, cpu) = completion;
+       return IPI_COMPLETION;
+}
+
+static void ipi_complete(unsigned int cpu)
+{
+       complete(per_cpu(cpu_completion, cpu));
+}
+
 /*
  * Main handler for inter-processor interrupts
  */
@@ -565,6 +591,20 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
                irq_exit();
                break;
 
+#ifdef CONFIG_IRQ_WORK
+       case IPI_IRQ_WORK:
+               irq_enter();
+               irq_work_run();
+               irq_exit();
+               break;
+#endif
+
+       case IPI_COMPLETION:
+               irq_enter();
+               ipi_complete(cpu);
+               irq_exit();
+               break;
+
        default:
                printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
                       cpu, ipinr);
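
IPI_COMPLETION lets a CPU be woken from its own IPI path once a remote CPU has finished some work (the big.LITTLE switcher is the expected user). A hedged usage sketch; how the remote CPU is told to raise the returned IPI number depends on the caller and is left as a comment:

        static void wait_for_remote_kick(void)
        {
                struct completion done;
                int cpu = get_cpu();
                int ipi;

                init_completion(&done);
                ipi = register_ipi_completion(&done, cpu);
                put_cpu();
                /* ... ask the remote CPU to raise IPI number 'ipi' at 'cpu' ... */
                wait_for_completion(&done);     /* completed from this CPU's IPI handler */
        }
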
index 5bc1a63284e3913a19afc53c71aefc1847e4c667..1aafa0d785eb835dd50d6036a855fc2bbf8ea32e 100644 (file)
@@ -28,7 +28,7 @@
  */
 unsigned int __init scu_get_core_count(void __iomem *scu_base)
 {
-       unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG);
+       unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG);
        return (ncores & 0x03) + 1;
 }
 
@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base)
 #ifdef CONFIG_ARM_ERRATA_764369
        /* Cortex-A9 only */
        if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) {
-               scu_ctrl = __raw_readl(scu_base + 0x30);
+               scu_ctrl = readl_relaxed(scu_base + 0x30);
                if (!(scu_ctrl & 1))
-                       __raw_writel(scu_ctrl | 0x1, scu_base + 0x30);
+                       writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30);
        }
 #endif
 
-       scu_ctrl = __raw_readl(scu_base + SCU_CTRL);
+       scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
        /* already enabled? */
        if (scu_ctrl & 1)
                return;
 
        scu_ctrl |= 1;
-       __raw_writel(scu_ctrl, scu_base + SCU_CTRL);
+       writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
 
        /*
         * Ensure that the data accessed by CPU0 before the SCU was
@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode)
        if (mode > 3 || mode == 1 || cpu > 3)
                return -EINVAL;
 
-       val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
+       val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
        val |= mode;
-       __raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu);
+       writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu);
 
        return 0;
 }
index 83ccca303df83c4a1f40dce35f324153d979ad16..95d063620b76a6f706bccc23635537ed4bceb01a 100644 (file)
@@ -70,6 +70,40 @@ static inline void ipi_flush_bp_all(void *ignored)
        local_flush_bp_all();
 }
 
+#ifdef CONFIG_ARM_ERRATA_798181
+bool (*erratum_a15_798181_handler)(void);
+
+static bool erratum_a15_798181_partial(void)
+{
+       asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+       dsb(ish);
+       return false;
+}
+
+static bool erratum_a15_798181_broadcast(void)
+{
+       asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+       dsb(ish);
+       return true;
+}
+
+void erratum_a15_798181_init(void)
+{
+       unsigned int midr = read_cpuid_id();
+       unsigned int revidr = read_cpuid(CPUID_REVIDR);
+
+       /* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
+       if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
+           (revidr & 0x210) == 0x210) {
+               return;
+       }
+       if (revidr & 0x10)
+               erratum_a15_798181_handler = erratum_a15_798181_partial;
+       else
+               erratum_a15_798181_handler = erratum_a15_798181_broadcast;
+}
+#endif
+
 static void ipi_flush_tlb_a15_erratum(void *arg)
 {
        dmb();
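
The MIDR comparison above encodes the affected revision window directly: for Cortex-A15 the variant (rN) sits in MIDR[23:20] and the revision (pN) in MIDR[3:0], so 0x413fc0f2 decodes to r3p2. A small hedged decoding sketch (helper names are illustrative):

        static unsigned int midr_variant(unsigned int midr)
        {
                return (midr >> 20) & 0xf;      /* rN, e.g. 3 for 0x413fc0f2 */
        }

        static unsigned int midr_revision(unsigned int midr)
        {
                return midr & 0xf;              /* pN, e.g. 2 for 0x413fc0f2 */
        }
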
@@ -80,7 +114,6 @@ static void broadcast_tlb_a15_erratum(void)
        if (!erratum_a15_798181())
                return;
 
-       dummy_flush_tlb_a15_erratum();
        smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1);
 }
 
@@ -92,7 +125,6 @@ static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm)
        if (!erratum_a15_798181())
                return;
 
-       dummy_flush_tlb_a15_erratum();
        this_cpu = get_cpu();
        a15_erratum_get_cpumask(this_cpu, mm, &mask);
        smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1);
index 2985c9f0905d0e8667210e7bcdc0b51765781524..6591e26fc13f4eab5fcd3a7b1292a3ca7cf3bd92 100644 (file)
@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode,
        case CLOCK_EVT_MODE_PERIODIC:
                ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE
                        | TWD_TIMER_CONTROL_PERIODIC;
-               __raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
+               writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
                        twd_base + TWD_TIMER_LOAD);
                break;
        case CLOCK_EVT_MODE_ONESHOT:
@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode,
                ctrl = 0;
        }
 
-       __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
+       writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
 }
 
 static int twd_set_next_event(unsigned long evt,
                        struct clock_event_device *unused)
 {
-       unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL);
+       unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL);
 
        ctrl |= TWD_TIMER_CONTROL_ENABLE;
 
-       __raw_writel(evt, twd_base + TWD_TIMER_COUNTER);
-       __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
+       writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER);
+       writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
 
        return 0;
 }
@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt,
  */
 static int twd_timer_ack(void)
 {
-       if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) {
-               __raw_writel(1, twd_base + TWD_TIMER_INTSTAT);
+       if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) {
+               writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT);
                return 1;
        }
 
@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void)
                waitjiffies += 5;
 
                                 /* enable, no interrupt or reload */
-               __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL);
+               writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL);
 
                                 /* maximum value */
-               __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
+               writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
 
                while (get_jiffies_64() < waitjiffies)
                        udelay(10);
 
-               count = __raw_readl(twd_base + TWD_TIMER_COUNTER);
+               count = readl_relaxed(twd_base + TWD_TIMER_COUNTER);
 
                twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
 
@@ -277,7 +277,7 @@ static void twd_timer_setup(void)
         * bother with the below.
         */
        if (per_cpu(percpu_setup_called, cpu)) {
-               __raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+               writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
                clockevents_register_device(clk);
                enable_percpu_irq(clk->irq, 0);
                return;
@@ -290,7 +290,7 @@ static void twd_timer_setup(void)
         * The following is done once per CPU the first time .setup() is
         * called.
         */
-       __raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+       writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
 
        clk->name = "local_timer";
        clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
index 41cf3cbf756de473be3bb1ebb268445aeacadc5d..2835d35234ca459f4d7086a6f811ff35652a6a11 100644 (file)
@@ -10,7 +10,7 @@
 #include <asm/suspend.h>
 #include <asm/tlbflush.h>
 
-extern int __cpu_suspend(unsigned long, int (*)(unsigned long));
+extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
 extern void cpu_resume_mmu(void);
 
 #ifdef CONFIG_MMU
@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void);
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
        struct mm_struct *mm = current->active_mm;
+       u32 __mpidr = cpu_logical_map(smp_processor_id());
        int ret;
 
        if (!idmap_pgd)
@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
         * resume (indicated by a zero return code), we need to switch
         * back to the correct page tables.
         */
-       ret = __cpu_suspend(arg, fn);
+       ret = __cpu_suspend(arg, fn, __mpidr);
        if (ret == 0) {
                cpu_switch_mm(mm->pgd, mm);
                local_flush_bp_all();
@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 #else
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
-       return __cpu_suspend(arg, fn);
+       u32 __mpidr = cpu_logical_map(smp_processor_id());
+       return __cpu_suspend(arg, fn, __mpidr);
 }
 #define        idmap_pgd       NULL
 #endif
index 8fcda140358d94d6056ea200b79b50bb6e7bde81..6125f259b7b5359072b0cd7a07e122fcd2bda4bd 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/unwind.h>
 #include <asm/tls.h>
 #include <asm/system_misc.h>
+#include <asm/opcodes.h>
 
 static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" };
 
@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
 int is_valid_bugaddr(unsigned long pc)
 {
 #ifdef CONFIG_THUMB2_KERNEL
-       unsigned short bkpt;
+       u16 bkpt;
+       u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE);
 #else
-       unsigned long bkpt;
+       u32 bkpt;
+       u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE);
 #endif
 
        if (probe_kernel_address((unsigned *)pc, bkpt))
                return 0;
 
-       return bkpt == BUG_INSTR_VALUE;
+       return bkpt == insn;
 }
 
 #endif
@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
        if (processor_mode(regs) == SVC_MODE) {
 #ifdef CONFIG_THUMB2_KERNEL
                if (thumb_mode(regs)) {
-                       instr = ((u16 *)pc)[0];
+                       instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]);
                        if (is_wide_instruction(instr)) {
-                               instr <<= 16;
-                               instr |= ((u16 *)pc)[1];
+                               u16 inst2;
+                               inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]);
+                               instr = __opcode_thumb32_compose(instr, inst2);
                        }
                } else
 #endif
-                       instr = *(u32 *) pc;
+                       instr = __mem_to_opcode_arm(*(u32 *) pc);
        } else if (thumb_mode(regs)) {
                if (get_user(instr, (u16 __user *)pc))
                        goto die_sig;
+               instr = __mem_to_opcode_thumb16(instr);
                if (is_wide_instruction(instr)) {
                        unsigned int instr2;
                        if (get_user(instr2, (u16 __user *)pc+1))
                                goto die_sig;
-                       instr <<= 16;
-                       instr |= instr2;
+                       instr2 = __mem_to_opcode_thumb16(instr2);
+                       instr = __opcode_thumb32_compose(instr, instr2);
                }
        } else if (get_user(instr, (u32 __user *)pc)) {
+               instr = __mem_to_opcode_arm(instr);
                goto die_sig;
        }
 
index 9c697db2787e2a524cf59db4519f5272fa2918b0..aea7ccb8d3970f236c34feb7d668c67bcb925b01 100644 (file)
@@ -65,7 +65,7 @@ static bool vgic_present;
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
        BUG_ON(preemptible());
-       __get_cpu_var(kvm_arm_running_vcpu) = vcpu;
+       __this_cpu_write(kvm_arm_running_vcpu, vcpu);
 }
 
 /**
@@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
 {
        BUG_ON(preemptible());
-       return __get_cpu_var(kvm_arm_running_vcpu);
+       return __this_cpu_read(kvm_arm_running_vcpu);
 }
 
 /**
@@ -815,7 +815,7 @@ static void cpu_init_hyp_mode(void *dummy)
 
        boot_pgd_ptr = kvm_mmu_get_boot_httbr();
        pgd_ptr = kvm_mmu_get_httbr();
-       stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
+       stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)__kvm_hyp_vector;
 
index d6408d1ee543fe5e3ceabbcda01b25efb07676ba..e0c68d5bb7dc25dd3fa93dc0fa1b3899f5b09019 100644 (file)
@@ -10,6 +10,11 @@ UNWIND(      .fnstart        )
        and     r3, r0, #31             @ Get bit offset
        mov     r0, r0, lsr #5
        add     r1, r1, r0, lsl #2      @ Get word offset
+#if __LINUX_ARM_ARCH__ >= 7
+       .arch_extension mp
+       ALT_SMP(W(pldw) [r1])
+       ALT_UP(W(nop))
+#endif
        mov     r3, r2, lsl r3
 1:     ldrex   r2, [r1]
        \instr  r2, r2, r3
index 025f742dd4df6bf79b279babd264980d851f01d5..3e58d710013c3ad9b377fc76e6dad58f377e88a7 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/hardirq.h> /* for in_atomic() */
 #include <linux/gfp.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <asm/current.h>
 #include <asm/page.h>
 
@@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
                return 0;
 
        pmd = pmd_offset(pud, addr);
-       if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+       if (unlikely(pmd_none(*pmd)))
+               return 0;
+
+       /*
+        * A pmd can be bad if it refers to a HugeTLB or THP page.
+        *
+        * Both THP and HugeTLB pages have the same pmd layout
+        * and should not be manipulated by the pte functions.
+        *
+        * Lock the page table for the destination and check that it
+        * is still huge, that a write would not fault, and that the
+        * THP is not in the middle of a split.
+        */
+       if (unlikely(pmd_thp_or_huge(*pmd))) {
+               ptl = &current->mm->page_table_lock;
+               spin_lock(ptl);
+               if (unlikely(!pmd_thp_or_huge(*pmd)
+                       || pmd_hugewillfault(*pmd)
+                       || pmd_trans_splitting(*pmd))) {
+                       spin_unlock(ptl);
+                       return 0;
+               }
+
+               *ptep = NULL;
+               *ptlp = ptl;
+               return 1;
+       }
+
+       if (unlikely(pmd_bad(*pmd)))
                return 0;
 
        pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
@@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
                from += tocopy;
                n -= tocopy;
 
-               pte_unmap_unlock(pte, ptl);
+               if (pte)
+                       pte_unmap_unlock(pte, ptl);
+               else
+                       spin_unlock(ptl);
        }
        if (!atomic)
                up_read(&current->mm->mmap_sem);
@@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
                addr += tocopy;
                n -= tocopy;
 
-               pte_unmap_unlock(pte, ptl);
+               if (pte)
+                       pte_unmap_unlock(pte, ptl);
+               else
+                       spin_unlock(ptl);
        }
        up_read(&current->mm->mmap_sem);
 
index 1fd2cf097e30fcfc10ce6ff7f200180340464496..eb1fa5c84723df5062fa54a154ef737653fa19b4 100644 (file)
@@ -692,14 +692,14 @@ static void netwinder_led_set(struct led_classdev *cdev,
        unsigned long flags;
        u32 reg;
 
-       spin_lock_irqsave(&nw_gpio_lock, flags);
+       raw_spin_lock_irqsave(&nw_gpio_lock, flags);
        reg = nw_gpio_read();
        if (b != LED_OFF)
                reg &= ~led->mask;
        else
                reg |= led->mask;
        nw_gpio_modify_op(led->mask, reg);
-       spin_unlock_irqrestore(&nw_gpio_lock, flags);
+       raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
 }
 
 static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
@@ -709,9 +709,9 @@ static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
        unsigned long flags;
        u32 reg;
 
-       spin_lock_irqsave(&nw_gpio_lock, flags);
+       raw_spin_lock_irqsave(&nw_gpio_lock, flags);
        reg = nw_gpio_read();
-       spin_unlock_irqrestore(&nw_gpio_lock, flags);
+       raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
 
        return (reg & led->mask) ? LED_OFF : LED_FULL;
 }
index fe98df44579cfa5c22c77ebd53dd9efbee15cf1e..08332d84144052899382d6d2543dc4d754779e6f 100644 (file)
@@ -4,11 +4,12 @@ config ARCH_HIGHBANK
        select ARCH_HAS_CPUFREQ
        select ARCH_HAS_HOLES_MEMORYMODEL
        select ARCH_HAS_OPP
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select ARM_AMBA
        select ARM_ERRATA_764369
        select ARM_ERRATA_775420
-       select ARM_ERRATA_798181
+       select ARM_ERRATA_798181 if SMP
        select ARM_GIC
        select ARM_PSCI
        select ARM_TIMER_SP804
index 30e1ebe3a8916e7f4fb9eb00397c180ea4786ff6..c342dc4e8a45c43097f6c28e1e45ba848e57a7f3 100644 (file)
@@ -1,9 +1,5 @@
 if ARCH_IXP4XX
 
-config ARCH_SUPPORTS_BIG_ENDIAN
-       bool
-       default y
-
 menu "Intel IXP4xx Implementation Options"
 
 comment "IXP4xx Platforms"
index 9eb63d7246023e061ff80a91b2a600f691906c58..5e269d7263cea17b31cfd9c4294a0a3a8ce9d84f 100644 (file)
@@ -1,5 +1,6 @@
 config ARCH_MVEBU
        bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select CLKSRC_MMIO
        select COMMON_CLK
        select GENERIC_CLOCKEVENTS
index 5476669ba9056ff80d63fab31d8f266ef25a2652..ee7598fe75db873dc81843610939d189969833a3 100644 (file)
@@ -20,6 +20,8 @@
 #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0
 #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4
 
+#include <asm/assembler.h>
+
        .text
 /*
  * r0: Coherency fabric base register address
@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent)
        /* Create bit by cpu index */
        mov     r3, #(1 << 24)
        lsl     r1, r3, r1
+ARM_BE8(rev    r1, r1)
 
        /* Add CPU to SMP group - Atomic */
        add     r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET
index 8a1b0c96e9ece9a05ba3c78a04e51c5d65c3f240..3dd80df428f7edbf92885011a64141bf472add8c 100644 (file)
 #include <linux/linkage.h>
 #include <linux/init.h>
 
+#include <asm/assembler.h>
+
 /*
  * Armada XP specific entry point for secondary CPUs.
  * We add the CPU to the coherency fabric and then jump to secondary
  * startup
  */
 ENTRY(armada_xp_secondary_startup)
+ ARM_BE8(setend        be )                    @ go BE8 if entered LE
+
        /* Get coherency fabric base physical address */
        adr     r0, 1f
        ldr     r1, [r0]
index e838ba27e443c1dd0b54042c3e4afd882ae22265..c9808c6841526204144e36a273596e6955696dc8 100644 (file)
@@ -512,6 +512,9 @@ static void __init assabet_map_io(void)
         * Its called GPCLKR0 in my SA1110 manual.
         */
        Ser1SDCR0 |= SDCR0_SUS;
+       MSC1 = (MSC1 & ~0xffff) |
+               MSC_NonBrst | MSC_32BitStMem |
+               MSC_RdAcc(2) | MSC_WrAcc(2) | MSC_Rec(0);
 
        if (!machine_has_neponset())
                sa1100_register_uart_fns(&assabet_port_fns);
diff --git a/arch/arm/mach-sa1100/include/mach/gpio.h b/arch/arm/mach-sa1100/include/mach/gpio.h
deleted file mode 100644 (file)
index 6a9eecf..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * arch/arm/mach-sa1100/include/mach/gpio.h
- *
- * SA1100 GPIO wrappers for arch-neutral GPIO calls
- *
- * Written by Philipp Zabel <philipp.zabel@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#ifndef __ASM_ARCH_SA1100_GPIO_H
-#define __ASM_ARCH_SA1100_GPIO_H
-
-#include <linux/io.h>
-#include <mach/hardware.h>
-#include <asm/irq.h>
-#include <asm-generic/gpio.h>
-
-#define __ARM_GPIOLIB_COMPLEX
-
-static inline int gpio_get_value(unsigned gpio)
-{
-       if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-               return GPLR & GPIO_GPIO(gpio);
-       else
-               return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-       if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-               if (value)
-                       GPSR = GPIO_GPIO(gpio);
-               else
-                       GPCR = GPIO_GPIO(gpio);
-       else
-               __gpio_set_value(gpio, value);
-}
-
-#define gpio_cansleep  __gpio_cansleep
-
-#endif
index 7d9df16f04a2276ccceba5b8315cf296988a2334..c810620db53d60235916e8ecf25c6402f2b984b1 100644 (file)
@@ -13,6 +13,8 @@
 #ifndef _INCLUDE_H3XXX_H_
 #define _INCLUDE_H3XXX_H_
 
+#include "hardware.h" /* Gives GPIO_MAX */
+
 /* Physical memory regions corresponding to chip selects */
 #define H3600_EGPIO_PHYS       (SA1100_CS5_PHYS + 0x01000000)
 #define H3600_BANK_2_PHYS      SA1100_CS2_PHYS
index bcbc94540e4574c605b99a82f4e642db9afda26b..41e476e571d7f8b2be7a78b312fe3d699c8eebe8 100644 (file)
@@ -19,6 +19,7 @@
 
 #include <mach/hardware.h>
 #include <asm/setup.h>
+#include <asm/irq.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
index 0bf04a0bca9d5c4d5d831b730de89cff332bb7b4..09e740f58b274184f0cd22b282def431d037b6d9 100644 (file)
@@ -51,7 +51,7 @@ config ARCH_TEGRA_3x_SOC
 
 config ARCH_TEGRA_114_SOC
        bool "Enable support for Tegra114 family"
-       select ARM_ERRATA_798181
+       select ARM_ERRATA_798181 if SMP
        select ARM_L1_CACHE_SHIFT_6
        select HAVE_ARM_ARCH_TIMER
        select PINCTRL_TEGRA114
index d7e7422527cac791938e4006873bad4fc8883d64..cbbb81e0e5096f7214e178fbed58b746a8dbf41b 100644 (file)
@@ -1,6 +1,7 @@
 config ARCH_VEXPRESS
        bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7
        select ARCH_REQUIRE_GPIOLIB
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select ARM_AMBA
        select ARM_GIC
        select ARM_TIMER_SP804
index 3a6384c6c4356129733962ec286a8fae5d83f779..14d499688736b3c816c082f3216aa12503005b64 100644 (file)
@@ -133,38 +133,8 @@ static void dcscb_power_down(void)
        if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
                arch_spin_unlock(&dcscb_lock);
 
-               /*
-                * Flush all cache levels for this cluster.
-                *
-                * To do so we do:
-                * - Clear the SCTLR.C bit to prevent further cache allocations
-                * - Flush the whole cache
-                * - Clear the ACTLR "SMP" bit to disable local coherency
-                *
-                * Let's do it in the safest possible way i.e. with
-                * no memory access within the following sequence
-                * including to the stack.
-                *
-                * Note: fp is preserved to the stack explicitly prior doing
-                * this since adding it to the clobber list is incompatible
-                * with having CONFIG_FRAME_POINTER=y.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_all \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               /* Flush all cache levels for this cluster. */
+               v7_exit_coherency_flush(all);
 
                /*
                 * This is a harmless no-op.  On platforms with a real
@@ -183,26 +153,8 @@ static void dcscb_power_down(void)
        } else {
                arch_spin_unlock(&dcscb_lock);
 
-               /*
-                * Flush the local CPU cache.
-                * Let's do it in the safest possible way as above.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_louis \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               /* Disable and flush the local CPU cache. */
+               v7_exit_coherency_flush(louis);
        }
 
        __mcpm_cpu_down(cpu, cluster);
index e6eb4819291241f30b51d5e7b58c14d1d07c0d32..4eb92ebfd95322e50c1428f7310c8c19e65abd2b 100644 (file)
@@ -156,32 +156,7 @@ static void tc2_pm_down(u64 residency)
                        : : "r" (0x400) );
                }
 
-               /*
-                * We need to disable and flush the whole (L1 and L2) cache.
-                * Let's do it in the safest possible way i.e. with
-                * no memory access within the following sequence
-                * including the stack.
-                *
-                * Note: fp is preserved to the stack explicitly prior doing
-                * this since adding it to the clobber list is incompatible
-                * with having CONFIG_FRAME_POINTER=y.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_all \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               v7_exit_coherency_flush(all);
 
                cci_disable_port_by_cpu(mpidr);
 
@@ -197,26 +172,7 @@ static void tc2_pm_down(u64 residency)
 
                arch_spin_unlock(&tc2_pm_lock);
 
-               /*
-                * We need to disable and flush only the L1 cache.
-                * Let's do it in the safest possible way as above.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_louis \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               v7_exit_coherency_flush(louis);
        }
 
        __mcpm_cpu_down(cpu, cluster);
index cd2c88e7a8f7557bfe299a7a6b364c395a969428..1f8fed94c2a499939354258fc61339a10d229a4f 100644 (file)
@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS
        help
          This option allows the use of custom mandatory barriers
          included via the mach/barriers.h file.
+
+config ARCH_SUPPORTS_BIG_ENDIAN
+       bool
+       help
+         This option specifies that the architecture can support big
+         endian operation.
index 80741992a9fcff0b98d963ec3465bf407c51c5f6..3815a8262af070b98f33d61ac31908961ee06eb2 100644 (file)
@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
        bne     do_DataAbort
        bic     r1, r1, #1 << 11                @ clear bit 11 of FSR
        ldr     r3, [r4]                        @ read aborted ARM instruction
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       rev     r3, r3
-#endif
+ ARM_BE8(rev   r3, r3)
+
        do_ldrd_abort tmp=ip, insn=r3
        tst     r3, #1 << 20                    @ L = 0 -> write
        orreq   r1, r1, #1 << 11                @ yes.
index 6f4585b89078749e39383d1aee50f51040624a74..924036473b16f437019002b8afac36a2a41e4010 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/cp15.h>
 #include <asm/system_info.h>
 #include <asm/unaligned.h>
+#include <asm/opcodes.h>
 
 #include "fault.h"
 
@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
        if (thumb_mode(regs)) {
                u16 *ptr = (u16 *)(instrptr & ~1);
                fault = probe_kernel_address(ptr, tinstr);
+               tinstr = __mem_to_opcode_thumb16(tinstr);
                if (!fault) {
                        if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
                            IS_T32(tinstr)) {
                                /* Thumb-2 32-bit */
                                u16 tinst2 = 0;
                                fault = probe_kernel_address(ptr + 1, tinst2);
-                               instr = (tinstr << 16) | tinst2;
+                               tinst2 = __mem_to_opcode_thumb16(tinst2);
+                               instr = __opcode_thumb32_compose(tinstr, tinst2);
                                thumb2_32b = 1;
                        } else {
                                isize = 2;
                                instr = thumb2arm(tinstr);
                        }
                }
-       } else
+       } else {
                fault = probe_kernel_address(instrptr, instr);
+               instr = __mem_to_opcode_arm(instr);
+       }
 
        if (fault) {
                type = TYPE_FAULT;
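
The alignment-fault change above routes each Thumb halfword through __mem_to_opcode_thumb16() and then combines the pair with __opcode_thumb32_compose() instead of open-coding the shift. As an illustration of the compose step only — a user-space sketch with hypothetical halfword values, leaving the BE8 byte-swapping helpers out:

#include <stdint.h>
#include <stdio.h>

/* Same composition rule as __opcode_thumb32_compose(): first halfword in
   the upper 16 bits, second halfword in the lower 16 bits. */
static uint32_t thumb32_compose(uint16_t first, uint16_t second)
{
        return ((uint32_t)first << 16) | second;
}

int main(void)
{
        uint16_t tinstr = 0xf8d0, tinst2 = 0x1004;      /* hypothetical halfwords */

        printf("composed opcode: 0x%08x\n",
               (unsigned)thumb32_compose(tinstr, tinst2));
        return 0;
}
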
index 644d91f73b00f3c5663b9e273d91d51a50218dd5..79f8b39801a8e570bd86ea80e566ee939bbb600b 100644 (file)
@@ -707,7 +707,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                    gfp_t gfp, struct dma_attrs *attrs)
 {
-       pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+       pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
        void *memory;
 
        if (dma_alloc_from_coherent(dev, size, handle, &memory))
@@ -720,7 +720,7 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
        dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+       pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
        void *memory;
 
        if (dma_alloc_from_coherent(dev, size, handle, &memory))
index 9d285626bc7da4fe3e2be2e952c38923c05f5204..312e15e6d00b8a6b909747305166e1bc7f581bf0 100644 (file)
@@ -9,8 +9,13 @@ int fixup_exception(struct pt_regs *regs)
        const struct exception_table_entry *fixup;
 
        fixup = search_exception_tables(instruction_pointer(regs));
-       if (fixup)
+       if (fixup) {
                regs->ARM_pc = fixup->fixup;
+#ifdef CONFIG_THUMB2_KERNEL
+               /* Clear the IT state to avoid nasty surprises in the fixup */
+               regs->ARM_cpsr &= ~PSR_IT_MASK;
+#endif
+       }
 
        return fixup != NULL;
 }
index 83cb3ac27095146f3f60c04047c6b212856a73b2..8e0e52eb76b57d7f9d4208fafbcdf024be369c75 100644 (file)
@@ -10,6 +10,7 @@
 #include <asm/system_info.h>
 
 pgd_t *idmap_pgd;
+phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
 
 #ifdef CONFIG_ARM_LPAE
 static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
        unsigned long addr, end;
        unsigned long next;
 
-       addr = virt_to_phys(text_start);
-       end = virt_to_phys(text_end);
+       addr = virt_to_idmap(text_start);
+       end = virt_to_idmap(text_end);
+       pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
 
        prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
 
@@ -90,8 +92,6 @@ static int __init init_static_idmap(void)
        if (!idmap_pgd)
                return -ENOMEM;
 
-       pr_info("Setting up static identity map for 0x%p - 0x%p\n",
-               __idmap_text_start, __idmap_text_end);
        identity_mapping_add(idmap_pgd, __idmap_text_start,
                             __idmap_text_end, 0);
 
index 0c6356255fe31122f2527a0a2947439e69b0e953..d27158c38eb0b190b869e028b93d8265fb90969e 100644 (file)
@@ -202,13 +202,11 @@ int valid_phys_addr_range(phys_addr_t addr, size_t size)
 }
 
 /*
- * We don't use supersection mappings for mmap() on /dev/mem, which
- * means that we can't map the memory area above the 4G barrier into
- * userspace.
+ * Do not allow /dev/mem mappings beyond the supported physical range.
  */
 int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 {
-       return !(pfn + (size >> PAGE_SHIFT) > 0x00100000);
+       return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
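
For a sense of what the new bound allows: a stand-alone sketch comparing the old hard-coded page-frame limit with one derived from PHYS_MASK. The 4 KiB page size and the 40-bit LPAE physical mask are assumptions for illustration; the kernel takes the real values from its headers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned page_shift = 12;                 /* assumed 4 KiB pages */
        const uint64_t phys_mask = (1ULL << 40) - 1;    /* assumed 40-bit LPAE mask */

        uint64_t old_limit_pages = 0x00100000;          /* old hard-coded cap */
        uint64_t new_limit_pages = 1 + (phys_mask >> page_shift);

        printf("old /dev/mem mmap limit: %llu GiB\n",
               (unsigned long long)(old_limit_pages >> (30 - page_shift)));
        printf("new /dev/mem mmap limit: %llu GiB\n",
               (unsigned long long)(new_limit_pages >> (30 - page_shift)));
        return 0;
}
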
index b1d17eeb59b895cd429e762d082d6c5c56c3ff57..78eeeca78f5ab331707fcd73b4956c503c2d880b 100644 (file)
@@ -28,6 +28,8 @@
 #include <asm/highmem.h>
 #include <asm/system_info.h>
 #include <asm/traps.h>
+#include <asm/procinfo.h>
+#include <asm/memory.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void)
        }
 }
 
+#ifdef CONFIG_ARM_LPAE
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+                             struct proc_info_list *procinfo)
+{
+       pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
+       unsigned long map_start, map_end;
+       pgd_t *pgd0, *pgdk;
+       pud_t *pud0, *pudk, *pud_start;
+       pmd_t *pmd0, *pmdk;
+       phys_addr_t phys;
+       int i;
+
+       if (!(mdesc->init_meminfo))
+               return;
+
+       /* remap kernel code and data */
+       map_start = init_mm.start_code;
+       map_end   = init_mm.brk;
+
+       /* get a handle on things... */
+       pgd0 = pgd_offset_k(0);
+       pud_start = pud0 = pud_offset(pgd0, 0);
+       pmd0 = pmd_offset(pud0, 0);
+
+       pgdk = pgd_offset_k(map_start);
+       pudk = pud_offset(pgdk, map_start);
+       pmdk = pmd_offset(pudk, map_start);
+
+       mdesc->init_meminfo();
+
+       /* Run the patch stub to update the constants */
+       fixup_pv_table(&__pv_table_begin,
+               (&__pv_table_end - &__pv_table_begin) << 2);
+
+       /*
+        * Cache cleaning operations for self-modifying code
+        * We should clean the entries by MVA but running a
+        * for loop over every pv_table entry pointer would
+        * just complicate the code.
+        */
+       flush_cache_louis();
+       dsb();
+       isb();
+
+       /* remap level 1 table */
+       for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
+               set_pud(pud0,
+                       __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
+               pmd0 += PTRS_PER_PMD;
+       }
+
+       /* remap pmds for kernel mapping */
+       phys = __pa(map_start) & PMD_MASK;
+       do {
+               *pmdk++ = __pmd(phys | pmdprot);
+               phys += PMD_SIZE;
+       } while (phys < map_end);
+
+       flush_cache_all();
+       cpu_switch_mm(pgd0, &init_mm);
+       cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+       local_flush_bp_all();
+       local_flush_tlb_all();
+}
+
+#else
+
+void __init early_paging_init(const struct machine_desc *mdesc,
+                             struct proc_info_list *procinfo)
+{
+       if (mdesc->init_meminfo)
+               mdesc->init_meminfo();
+}
+
+#endif
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
index 34d4ab217babb39b563e493cb4d4643963d33a64..5c668b7a31f97e6df35dcec53b79d5a70a9d1d0b 100644 (file)
@@ -295,6 +295,15 @@ void __init sanity_check_meminfo(void)
        high_memory = __va(end - 1) + 1;
 }
 
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+                             struct proc_info_list *procinfo)
+{
+}
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
index 1128064fddcbd6c11754d5c2b4241e8b51c7d6a5..45dc29f85d56ca79ae7fa6076f3cee103465e15b 100644 (file)
@@ -220,9 +220,7 @@ __v6_setup:
 #endif /* CONFIG_MMU */
        adr     r5, v6_crval
        ldmia   r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       orr     r6, r6, #1 << 25                @ big-endian page tables
-#endif
+ ARM_BE8(orr   r6, r6, #1 << 25)               @ big-endian page tables
        mrc     p15, 0, r0, c1, c0, 0           @ read control register
        bic     r0, r0, r5                      @ clear the bits
        orr     r0, r0, r6                      @ set them
index c63d9bdee51e65cab7bbad13632207702db41273..60920f62fdf5994f04477bc94aba09dcd349385a 100644 (file)
@@ -367,9 +367,7 @@ __v7_setup:
 #endif
        adr     r5, v7_crval
        ldmia   r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       orr     r6, r6, #1 << 25                @ big-endian page tables
-#endif
+ ARM_BE8(orr   r6, r6, #1 << 25)               @ big-endian page tables
 #ifdef CONFIG_SWP_EMULATE
        orr     r5, r5, #(1 << 10)              @ set SW bit in "clear"
        bic     r6, r6, #(1 << 10)              @ clear it in "mmuset"
index 99b44e0e8d866983dd60d4b8324bf29010679e77..9ed155ad0f97c42c80fd903919a51dac8b0298d4 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
 #include <asm/hwcap.h>
+#include <asm/opcodes.h>
 
 #include "bpf_jit_32.h"
 
@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor)
 
 static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
 {
+       inst |= (cond << 28);
+       inst = __opcode_to_mem_arm(inst);
+
        if (ctx->target != NULL)
-               ctx->target[ctx->idx] = inst | (cond << 28);
+               ctx->target[ctx->idx] = inst;
 
        ctx->idx++;
 }
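
The JIT change folds the condition field into the instruction first and then runs it through __opcode_to_mem_arm() before storing. A rough user-space model of what that conversion amounts to — the swab32() helper and the sample opcode below are illustrative, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* On a BE8 kernel (big-endian data, little-endian instructions) an opcode
   must be byte-swapped before it is written out with an ordinary data
   store; on a little-endian kernel the conversion is an identity. */
static uint32_t swab32(uint32_t x)
{
        return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
               ((x & 0x00ff0000u) >> 8)  | ((x & 0xff000000u) >> 24);
}

int main(void)
{
        uint32_t inst = 0xe1a00000;     /* mov r0, r0; cond AL already in bits 31:28 */

        printf("LE kernel stores:  0x%08x\n", (unsigned)inst);
        printf("BE8 kernel stores: 0x%08x\n", (unsigned)swab32(inst));
        return 0;
}
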
index 2677bc3762d7ce418b4c116b709dadf0f8e53697..40f27e52de759aed3ab7d3f7e912bf4feab3a358 100644 (file)
@@ -10,6 +10,7 @@
  */
 #include <linux/linkage.h>
 #include <linux/init.h>
+#include <asm/assembler.h>
 
 /*
  * Realview/Versatile Express specific entry point for secondary CPUs.
@@ -17,6 +18,7 @@
  * until we're ready for them to initialise.
  */
 ENTRY(versatile_secondary_startup)
+ ARM_BE8(setend        be)
        mrc     p15, 0, r0, c0, c0, 5
        bic     r0, #0xff000000
        adr     r4, 1f
index 52b8f40b1c73d48d206d497a6fadc381b5ddd888..2f37e1d6cb4500f57386800c8d282df85a6431b2 100644 (file)
@@ -642,9 +642,9 @@ int vfp_restore_user_hwstate(struct user_vfp __user *ufp,
 static int vfp_hotplug(struct notifier_block *b, unsigned long action,
        void *hcpu)
 {
-       if (action == CPU_DYING || action == CPU_DYING_FROZEN) {
-               vfp_force_reload((long)hcpu, current_thread_info());
-       else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
+       if (action == CPU_DYING || action == CPU_DYING_FROZEN)
+               vfp_current_hw_state[(long)hcpu] = NULL;
+       else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
                vfp_enable(NULL);
        return NOTIFY_OK;
 }
index 8363644685711895cbb049c735b6ebb15cf81ff1..01de5aaa3edc112a42548a1b3906e4353bcce949 100644 (file)
@@ -126,20 +126,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
        return oldval;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-       unsigned long tmp, tmp2;
-
-       asm volatile("// atomic_clear_mask\n"
-"1:    ldxr    %0, %2\n"
-"      bic     %0, %0, %3\n"
-"      stxr    %w1, %0, %2\n"
-"      cbnz    %w1, 1b"
-       : "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr)
-       : "Ir" (mask)
-       : "cc");
-}
-
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
index cbfacf7fb4387160862c906c38ad72dea395305c..6a0a9b132d7af11714348f3950990a9854d56ba6 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/uaccess.h>
 
 #include <asm/debug-monitors.h>
-#include <asm/local.h>
 #include <asm/cputype.h>
 #include <asm/system_misc.h>
 
@@ -89,8 +88,8 @@ early_param("nodebugmon", early_debug_disable);
  * Keep track of debug users on each core.
  * The ref counts are per-cpu so we use a local_t type.
  */
-static DEFINE_PER_CPU(local_t, mde_ref_count);
-static DEFINE_PER_CPU(local_t, kde_ref_count);
+static DEFINE_PER_CPU(int, mde_ref_count);
+static DEFINE_PER_CPU(int, kde_ref_count);
 
 void enable_debug_monitors(enum debug_el el)
 {
@@ -98,11 +97,11 @@ void enable_debug_monitors(enum debug_el el)
 
        WARN_ON(preemptible());
 
-       if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
+       if (this_cpu_inc_return(mde_ref_count) == 1)
                enable = DBG_MDSCR_MDE;
 
        if (el == DBG_ACTIVE_EL1 &&
-           local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
+           this_cpu_inc_return(kde_ref_count) == 1)
                enable |= DBG_MDSCR_KDE;
 
        if (enable && debug_enabled) {
@@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_el el)
 
        WARN_ON(preemptible());
 
-       if (local_dec_and_test(&__get_cpu_var(mde_ref_count)))
+       if (this_cpu_dec_return(mde_ref_count) == 0)
                disable = ~DBG_MDSCR_MDE;
 
        if (el == DBG_ACTIVE_EL1 &&
-           local_dec_and_test(&__get_cpu_var(kde_ref_count)))
+           this_cpu_dec_return(kde_ref_count) == 0)
                disable &= ~DBG_MDSCR_KDE;
 
        if (disable) {
index 329218ca9ffbb74456155a44f827a7231e70da8a..ff516f6691e48b77067cfe789d04e1680e79e15f 100644 (file)
@@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
                /* Breakpoint */
                ctrl_reg = AARCH64_DBG_REG_BCR;
                val_reg = AARCH64_DBG_REG_BVR;
-               slots = __get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
                reg_enable = !debug_info->bps_disabled;
        } else {
                /* Watchpoint */
                ctrl_reg = AARCH64_DBG_REG_WCR;
                val_reg = AARCH64_DBG_REG_WVR;
-               slots = __get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
                reg_enable = !debug_info->wps_disabled;
        }
@@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
        if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
                /* Breakpoint */
                base = AARCH64_DBG_REG_BCR;
-               slots = __get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
        } else {
                /* Watchpoint */
                base = AARCH64_DBG_REG_WCR;
-               slots = __get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
        }
 
@@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg, enum debug_el el, int enable)
 
        switch (reg) {
        case AARCH64_DBG_REG_BCR:
-               slots = __get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
                break;
        case AARCH64_DBG_REG_WCR:
-               slots = __get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
                break;
        default:
@@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
        struct debug_info *debug_info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+       slots = this_cpu_ptr(bp_on_reg);
        addr = instruction_pointer(regs);
        debug_info = &current->thread.debug;
 
@@ -596,7 +596,7 @@ unlock:
                        user_enable_single_step(current);
        } else {
                toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0);
-               kernel_step = &__get_cpu_var(stepping_kernel_bp);
+               kernel_step = this_cpu_ptr(&stepping_kernel_bp);
 
                if (*kernel_step != ARM_KERNEL_STEP_NONE)
                        return 0;
@@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
        struct arch_hw_breakpoint *info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+       slots = this_cpu_ptr(wp_on_reg);
        debug_info = &current->thread.debug;
 
        for (i = 0; i < core_num_wrps; ++i) {
@@ -698,7 +698,7 @@ unlock:
                        user_enable_single_step(current);
        } else {
                toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0);
-               kernel_step = &__get_cpu_var(stepping_kernel_bp);
+               kernel_step = this_cpu_ptr(&stepping_kernel_bp);
 
                if (*kernel_step != ARM_KERNEL_STEP_NONE)
                        return 0;
@@ -722,7 +722,7 @@ int reinstall_suspended_bps(struct pt_regs *regs)
        struct debug_info *debug_info = &current->thread.debug;
        int handled_exception = 0, *kernel_step;
 
-       kernel_step = &__get_cpu_var(stepping_kernel_bp);
+       kernel_step = this_cpu_ptr(&stepping_kernel_bp);
 
        /*
         * Called from single-step exception handler.
index 5d14470452ac5959712ef9927b3a093bf8975bf3..0e63c98d224c5756c58ba0a4145c40273bd87e86 100644 (file)
@@ -1044,7 +1044,7 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev)
         */
        regs = get_irq_regs();
 
-       cpuc = &__get_cpu_var(cpu_hw_events);
+       cpuc = this_cpu_ptr(&cpu_hw_events);
        for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
                struct perf_event *event = cpuc->events[idx];
                struct hw_perf_event *hwc;
@@ -1258,7 +1258,7 @@ device_initcall(register_pmu_driver);
 
 static struct pmu_hw_events *armpmu_get_cpu_events(void)
 {
-       return &__get_cpu_var(cpu_hw_events);
+       return this_cpu_ptr(&cpu_hw_events);
 }
 
 static void __init cpu_pmu_init(struct arm_pmu *armpmu)
index 69ce573f1224560b4f5c7532e21053b27c651da4..71f337aefa3905feaca892b7ac3b49b4bcb411e3 100644 (file)
@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM
 
          See <http://csrc.nist.gov/encryption/aes/> for more information.
 
+config CRYPTO_AES_ARM_BS
+       tristate "Bit sliced AES using NEON instructions"
+       depends on ARM && KERNEL_MODE_NEON
+       select CRYPTO_ALGAPI
+       select CRYPTO_AES_ARM
+       select CRYPTO_ABLK_HELPER
+       help
+         Use a faster and more secure NEON-based implementation of AES in
+         CBC, CTR and XTS modes.
+
+         Bit-sliced AES gives around a 45% speedup on Cortex-A15 for CTR
+         mode and for XTS mode encryption; for CBC and XTS mode decryption
+         the speedup is around 25%. (CBC encryption speed is not affected
+         by this driver.) This implementation does not rely on any lookup
+         tables, so it is believed to be invulnerable to cache timing
+         attacks.
+
 config CRYPTO_ANUBIS
        tristate "Anubis cipher algorithm"
        select CRYPTO_ALGAPI
index bb5b90e8e7687a0b71d33aae92f7050f741a6fa9..b6739cb78e320755e4fb9480c875b6cc2890dd4e 100644 (file)
@@ -852,7 +852,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
 
        /* Enable the CCI port */
 "      ldr     r0, [r0, %[offsetof_port_phys]] \n"
-"      mov     r3, #"__stringify(CCI_ENABLE_REQ)" \n"
+"      mov     r3, %[cci_enable_req]\n"                   
 "      str     r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n"
 
        /* poll the status reg for completion */
@@ -860,7 +860,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
 "      ldr     r0, [r1] \n"
 "      ldr     r0, [r0, r1]            @ cci_ctrl_base \n"
 "4:    ldr     r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n"
-"      tst     r1, #1 \n"
+"      tst     r1, %[cci_control_status_bits] \n"                      
 "      bne     4b \n"
 
 "      mov     r0, #0 \n"
@@ -873,6 +873,8 @@ asmlinkage void __naked cci_enable_port_for_self(void)
 "7:    .word   cci_ctrl_phys - . \n"
        : :
        [sizeof_cpu_port] "i" (sizeof(cpu_port)),
+       [cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ),
+       [cci_control_status_bits] "i" cpu_to_le32(1),
 #ifndef __ARMEB__
        [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)),
 #else
index 8ea3b33d4b40bb4b96ad7edd83f6dac7e219aa03..a90be34e4d5c24569dc64a320e59d0a9474e0908 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/module.h>
-
+#include <linux/io.h>
 #include <mach/hardware.h>
 #include <mach/irqs.h>
 
index d0e948084eaf7e2211c323f5976ecc08e5681136..9031171c141b52c5e9175fdbf6eec9bd0c4224b3 100644 (file)
@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
        if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
                return -EINVAL;
 
+       raw_spin_lock(&irq_controller_lock);
        mask = 0xff << shift;
        bit = gic_cpu_map[cpu] << shift;
-
-       raw_spin_lock(&irq_controller_lock);
        val = readl_relaxed(reg) & ~mask;
        writel_relaxed(val | bit, reg);
        raw_spin_unlock(&irq_controller_lock);
@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic)
 void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 {
        int cpu;
-       unsigned long map = 0;
+       unsigned long flags, map = 0;
+
+       raw_spin_lock_irqsave(&irq_controller_lock, flags);
 
        /* Convert our logical CPU mask into a physical one. */
        for_each_cpu(cpu, mask)
@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 
        /* this always happens on GIC0 */
        writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+
+       raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
+}
+#endif
+
+#ifdef CONFIG_BL_SWITCHER
+/*
+ * gic_send_sgi - send an SGI directly to the given CPU interface number
+ *
+ * cpu_id: the ID for the destination CPU interface
+ * irq: the IPI number to send an SGI for
+ */
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
+{
+       BUG_ON(cpu_id >= NR_GIC_CPU_IF);
+       cpu_id = 1 << cpu_id;
+       /* this always happens on GIC0 */
+       writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+}
+
+/*
+ * gic_get_cpu_id - get the CPU interface ID for the specified CPU
+ *
+ * @cpu: the logical CPU number to get the GIC ID for.
+ *
+ * Return the CPU interface ID for the given logical CPU number,
+ * or -1 if the CPU number is too large or the interface ID is
+ * unknown (more than one bit set).
+ */
+int gic_get_cpu_id(unsigned int cpu)
+{
+       unsigned int cpu_bit;
+
+       if (cpu >= NR_GIC_CPU_IF)
+               return -1;
+       cpu_bit = gic_cpu_map[cpu];
+       if (cpu_bit & (cpu_bit - 1))
+               return -1;
+       return __ffs(cpu_bit);
 }
+
+/*
+ * gic_migrate_target - migrate IRQs to another CPU interface
+ *
+ * @new_cpu_id: the CPU target ID to migrate IRQs to
+ *
+ * Migrate all peripheral interrupts with a target matching the current CPU
+ * to the interface corresponding to @new_cpu_id.  The CPU interface mapping
+ * is also updated.  Targets to other CPU interfaces are unchanged.
+ * This must be called with IRQs locally disabled.
+ */
+void gic_migrate_target(unsigned int new_cpu_id)
+{
+       unsigned int cur_cpu_id, gic_irqs, gic_nr = 0;
+       void __iomem *dist_base;
+       int i, ror_val, cpu = smp_processor_id();
+       u32 val, cur_target_mask, active_mask;
+
+       if (gic_nr >= MAX_GIC_NR)
+               BUG();
+
+       dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+       if (!dist_base)
+               return;
+       gic_irqs = gic_data[gic_nr].gic_irqs;
+
+       cur_cpu_id = __ffs(gic_cpu_map[cpu]);
+       cur_target_mask = 0x01010101 << cur_cpu_id;
+       ror_val = (cur_cpu_id - new_cpu_id) & 31;
+
+       raw_spin_lock(&irq_controller_lock);
+
+       /* Update the target interface for this logical CPU */
+       gic_cpu_map[cpu] = 1 << new_cpu_id;
+
+       /*
+        * Find all the peripheral interrupts targeting the current
+        * CPU interface and migrate them to the new CPU interface.
+        * We skip DIST_TARGET 0 to 7 as they are read-only.
+        */
+       for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
+               val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
+               active_mask = val & cur_target_mask;
+               if (active_mask) {
+                       val &= ~active_mask;
+                       val |= ror32(active_mask, ror_val);
+                       writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4);
+               }
+       }
+
+       raw_spin_unlock(&irq_controller_lock);
+
+       /*
+        * Now let's migrate and clear any potential SGIs that might be
+        * pending for us (cur_cpu_id).  Since GIC_DIST_SGI_PENDING_SET
+        * is a banked register, we can only forward the SGI using
+        * GIC_DIST_SOFTINT.  The original SGI source is lost but Linux
+        * doesn't use that information anyway.
+        *
+        * For the same reason we do not adjust SGI source information
+        * for SGIs we previously sent to other CPUs either.
+        */
+       for (i = 0; i < 16; i += 4) {
+               int j;
+               val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
+               if (!val)
+                       continue;
+               writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
+               for (j = i; j < i + 4; j++) {
+                       if (val & 0xff)
+                               writel_relaxed((1 << (new_cpu_id + 16)) | j,
+                                               dist_base + GIC_DIST_SOFTINT);
+                       val >>= 8;
+               }
+       }
+}
+
+/*
+ * gic_get_sgir_physaddr - get the physical address for the SGI register
+ *
+ * Return the physical address of the SGI register to be used
+ * by some early assembly code when the kernel is not yet available.
+ */
+static unsigned long gic_dist_physaddr;
+
+unsigned long gic_get_sgir_physaddr(void)
+{
+       if (!gic_dist_physaddr)
+               return 0;
+       return gic_dist_physaddr + GIC_DIST_SOFTINT;
+}
+
+void __init gic_init_physaddr(struct device_node *node)
+{
+       struct resource res;
+       if (of_address_to_resource(node, 0, &res) == 0) {
+               gic_dist_physaddr = res.start;
+               pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
+       }
+}
+
+#else
+#define gic_init_physaddr(node)  do { } while (0)
 #endif
 
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent)
                percpu_offset = 0;
 
        gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
+       if (!gic_cnt)
+               gic_init_physaddr(node);
 
        if (parent) {
                irq = irq_of_parse_and_map(node, 0);
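
The per-byte rotation in gic_migrate_target() deserves a worked example. Below is a user-space sketch with a local ror32() stand-in and a made-up GIC_DIST_TARGET word; it mirrors the per-word step of the loop above, retargeting the interrupts that currently point at CPU interface 0 to interface 2:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the kernel's ror32() (shift is non-zero here). */
static uint32_t ror32(uint32_t word, unsigned int shift)
{
        return (word >> shift) | (word << (32 - shift));
}

int main(void)
{
        /* Made-up GIC_DIST_TARGET word: one target byte per interrupt.
           Interrupts 0 and 3 target interface 0, interrupt 1 targets
           interface 1, interrupt 2 targets interface 2. */
        uint32_t val = 0x01040201;
        unsigned int cur_cpu_id = 0, new_cpu_id = 2;

        uint32_t cur_target_mask = 0x01010101u << cur_cpu_id;
        unsigned int ror_val = (cur_cpu_id - new_cpu_id) & 31;
        uint32_t active_mask = val & cur_target_mask;

        val &= ~active_mask;
        val |= ror32(active_mask, ror_val);

        /* Interrupts 0 and 3 now target interface 2: prints 0x04040204. */
        printf("retargeted word: 0x%08x\n", (unsigned)val);
        return 0;
}
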
index c3785edc0e92c851d3c36a0481a1b2936f92fc63..d135c76c4855b825175370e215979bd528e4b2d4 100644 (file)
@@ -62,6 +62,7 @@ static unsigned int fmax = 515633;
  * @signal_direction: input/out direction of bus signals can be indicated
  * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
  * @busy_detect: true if busy detection on dat0 is supported
+ * @pwrreg_nopower: bits in MMCIPOWER don't control ext. power supply
  */
 struct variant_data {
        unsigned int            clkreg;
@@ -76,6 +77,7 @@ struct variant_data {
        bool                    signal_direction;
        bool                    pwrreg_clkgate;
        bool                    busy_detect;
+       bool                    pwrreg_nopower;
 };
 
 static struct variant_data variant_arm = {
@@ -109,6 +111,7 @@ static struct variant_data variant_u300 = {
        .pwrreg_powerup         = MCI_PWR_ON,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
+       .pwrreg_nopower         = true,
 };
 
 static struct variant_data variant_nomadik = {
@@ -121,6 +124,7 @@ static struct variant_data variant_nomadik = {
        .pwrreg_powerup         = MCI_PWR_ON,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
+       .pwrreg_nopower         = true,
 };
 
 static struct variant_data variant_ux500 = {
@@ -135,6 +139,7 @@ static struct variant_data variant_ux500 = {
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
+       .pwrreg_nopower         = true,
 };
 
 static struct variant_data variant_ux500v2 = {
@@ -150,6 +155,7 @@ static struct variant_data variant_ux500v2 = {
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
+       .pwrreg_nopower         = true,
 };
 
 static int mmci_card_busy(struct mmc_host *mmc)
@@ -189,6 +195,21 @@ static int mmci_validate_data(struct mmci_host *host,
        return 0;
 }
 
+static void mmci_reg_delay(struct mmci_host *host)
+{
+       /*
+        * According to the spec, at least three feedback clock cycles
+        * of max 52 MHz must pass between two writes to the MMCICLOCK reg.
+        * Three MCLK clock cycles must pass between two MMCIPOWER reg writes.
+        * Worst delay time during card init is at 100 kHz => 30 us.
+        * Worst delay time when up and running is at 25 MHz => 120 ns.
+        */
+       if (host->cclk < 25000000)
+               udelay(30);
+       else
+               ndelay(120);
+}
+
 /*
  * This must be called with host->lock held
  */
@@ -1264,6 +1285,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 
        mmci_set_clkreg(host, ios->clock);
        mmci_write_pwrreg(host, pwr);
+       mmci_reg_delay(host);
 
        spin_unlock_irqrestore(&host->lock, flags);
 
@@ -1510,23 +1532,6 @@ static int mmci_probe(struct amba_device *dev,
                mmc->f_max = min(host->mclk, fmax);
        dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
 
-       host->pinctrl = devm_pinctrl_get(&dev->dev);
-       if (IS_ERR(host->pinctrl)) {
-               ret = PTR_ERR(host->pinctrl);
-               goto clk_disable;
-       }
-
-       host->pins_default = pinctrl_lookup_state(host->pinctrl,
-                       PINCTRL_STATE_DEFAULT);
-
-       /* enable pins to be muxed in and configured */
-       if (!IS_ERR(host->pins_default)) {
-               ret = pinctrl_select_state(host->pinctrl, host->pins_default);
-               if (ret)
-                       dev_warn(&dev->dev, "could not set default pins\n");
-       } else
-               dev_warn(&dev->dev, "could not get default pinstate\n");
-
        /* Get regulators and the supported OCR mask */
        mmc_regulator_get_supply(mmc);
        if (!mmc->ocr_avail)
@@ -1760,6 +1765,41 @@ static int mmci_resume(struct device *dev)
 #endif
 
 #ifdef CONFIG_PM_RUNTIME
+static void mmci_save(struct mmci_host *host)
+{
+       unsigned long flags;
+
+       if (host->variant->pwrreg_nopower) {
+               spin_lock_irqsave(&host->lock, flags);
+
+               writel(0, host->base + MMCIMASK0);
+               writel(0, host->base + MMCIDATACTRL);
+               writel(0, host->base + MMCIPOWER);
+               writel(0, host->base + MMCICLOCK);
+               mmci_reg_delay(host);
+
+               spin_unlock_irqrestore(&host->lock, flags);
+       }
+
+}
+
+static void mmci_restore(struct mmci_host *host)
+{
+       unsigned long flags;
+
+       if (host->variant->pwrreg_nopower) {
+               spin_lock_irqsave(&host->lock, flags);
+
+               writel(host->clk_reg, host->base + MMCICLOCK);
+               writel(host->datactrl_reg, host->base + MMCIDATACTRL);
+               writel(host->pwr_reg, host->base + MMCIPOWER);
+               writel(MCI_IRQENABLE, host->base + MMCIMASK0);
+               mmci_reg_delay(host);
+
+               spin_unlock_irqrestore(&host->lock, flags);
+       }
+}
+
 static int mmci_runtime_suspend(struct device *dev)
 {
        struct amba_device *adev = to_amba_device(dev);
@@ -1767,6 +1807,8 @@ static int mmci_runtime_suspend(struct device *dev)
 
        if (mmc) {
                struct mmci_host *host = mmc_priv(mmc);
+               pinctrl_pm_select_sleep_state(dev);
+               mmci_save(host);
                clk_disable_unprepare(host->clk);
        }
 
@@ -1781,6 +1823,8 @@ static int mmci_runtime_resume(struct device *dev)
        if (mmc) {
                struct mmci_host *host = mmc_priv(mmc);
                clk_prepare_enable(host->clk);
+               mmci_restore(host);
+               pinctrl_pm_select_default_state(dev);
        }
 
        return 0;
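
The 30 us and 120 ns figures quoted in the mmci_reg_delay() comment both fall out of the same "three clock cycles at frequency f" rule. A quick stand-alone check — the function name below is illustrative only:

#include <stdio.h>

/* Three cycles of a clock running at cclk_hz, expressed in nanoseconds. */
static unsigned long three_cycles_ns(unsigned long cclk_hz)
{
        return 3000000000UL / cclk_hz;
}

int main(void)
{
        printf("card init at 100 kHz: %lu ns (30 us)\n", three_cycles_ns(100000));
        printf("running at 25 MHz:    %lu ns\n", three_cycles_ns(25000000));
        return 0;
}
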
index 69080fab637520af2a9ea8b4a6ba49ae52b0344f..168bc72f7a94a9b662d7c0a97775c781d4d769aa 100644 (file)
@@ -200,10 +200,6 @@ struct mmci_host {
        struct sg_mapping_iter  sg_miter;
        unsigned int            size;
 
-       /* pinctrl handles */
-       struct pinctrl          *pinctrl;
-       struct pinctrl_state    *pins_default;
-
 #ifdef CONFIG_DMA_ENGINE
        /* DMA stuff */
        struct dma_chan         *dma_current;
index 682df0e1954a96718ae1e439db73441681101145..63b5eff0a80f647ffc0c6bdb6064ac56d482fc34 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/resource.h>
 #include <linux/regulator/consumer.h>
 
-#define AMBA_NR_IRQS   2
+#define AMBA_NR_IRQS   9
 #define AMBA_CID       0xb105f00d
 
 struct clk;
index 0e5d9ecdb2b672d901b47f184a4b720e604317e2..cac496b1e279293164066ea0204c87309169bd49 100644 (file)
@@ -31,6 +31,8 @@
 #define GIC_DIST_TARGET                        0x800
 #define GIC_DIST_CONFIG                        0xc00
 #define GIC_DIST_SOFTINT               0xf00
+#define GIC_DIST_SGI_PENDING_CLEAR     0xf10
+#define GIC_DIST_SGI_PENDING_SET       0xf20
 
 #define GICH_HCR                       0x0
 #define GICH_VTR                       0x4
@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start,
        gic_init_bases(nr, start, dist, cpu, 0, NULL);
 }
 
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
+int gic_get_cpu_id(unsigned int cpu);
+void gic_migrate_target(unsigned int new_cpu_id);
+unsigned long gic_get_sgir_physaddr(void);
+
 #endif /* __ASSEMBLY */
 
 #endif
diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h
new file mode 100644 (file)
index 0000000..f76dd4d
--- /dev/null
@@ -0,0 +1,67 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM power
+
+#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWER_CPU_MIGRATE_H
+
+#include <linux/tracepoint.h>
+
+#define __cpu_migrate_proto                    \
+       TP_PROTO(u64 timestamp,                 \
+                u32 cpu_hwid)
+#define __cpu_migrate_args                     \
+       TP_ARGS(timestamp,                      \
+               cpu_hwid)
+
+DECLARE_EVENT_CLASS(cpu_migrate,
+
+       __cpu_migrate_proto,
+       __cpu_migrate_args,
+
+       TP_STRUCT__entry(
+               __field(u64,    timestamp               )
+               __field(u32,    cpu_hwid                )
+       ),
+
+       TP_fast_assign(
+               __entry->timestamp = timestamp;
+               __entry->cpu_hwid = cpu_hwid;
+       ),
+
+       TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
+               (unsigned long long)__entry->timestamp,
+               (unsigned long)__entry->cpu_hwid
+       )
+);
+
+#define __define_cpu_migrate_event(name)               \
+       DEFINE_EVENT(cpu_migrate, cpu_migrate_##name,   \
+               __cpu_migrate_proto,                    \
+               __cpu_migrate_args                      \
+       )
+
+__define_cpu_migrate_event(begin);
+__define_cpu_migrate_event(finish);
+__define_cpu_migrate_event(current);
+
+#undef __define_cpu_migrate
+#undef __cpu_migrate_proto
+#undef __cpu_migrate_args
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+
+/*
+ * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
+ * a whole-cluster migration:
+ */
+#define CPU_MIGRATE_ALL_CPUS 0x80000000U
+#endif
+
+#endif /* _TRACE_POWER_CPU_MIGRATE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE power_cpu_migrate
+#include <trace/define_trace.h>
index 15130b50dfe3be2fea6a975a102283f6d33e4887..fe9b61e322a557b063ecc78eccb9a87c8de73dda 100644 (file)
@@ -2,3 +2,6 @@ ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
+endif
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
new file mode 100644 (file)
index 0000000..2a1cfde
--- /dev/null
@@ -0,0 +1,54 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include "../../util/types.h"
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1)
+#define PERF_REG_IP    PERF_REG_ARM_PC
+#define PERF_REG_SP    PERF_REG_ARM_SP
+
+static inline const char *perf_reg_name(int id)
+{
+       switch (id) {
+       case PERF_REG_ARM_R0:
+               return "r0";
+       case PERF_REG_ARM_R1:
+               return "r1";
+       case PERF_REG_ARM_R2:
+               return "r2";
+       case PERF_REG_ARM_R3:
+               return "r3";
+       case PERF_REG_ARM_R4:
+               return "r4";
+       case PERF_REG_ARM_R5:
+               return "r5";
+       case PERF_REG_ARM_R6:
+               return "r6";
+       case PERF_REG_ARM_R7:
+               return "r7";
+       case PERF_REG_ARM_R8:
+               return "r8";
+       case PERF_REG_ARM_R9:
+               return "r9";
+       case PERF_REG_ARM_R10:
+               return "r10";
+       case PERF_REG_ARM_FP:
+               return "fp";
+       case PERF_REG_ARM_IP:
+               return "ip";
+       case PERF_REG_ARM_SP:
+               return "sp";
+       case PERF_REG_ARM_LR:
+               return "lr";
+       case PERF_REG_ARM_PC:
+               return "pc";
+       default:
+               return NULL;
+       }
+
+       return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/arm/util/unwind.c b/tools/perf/arch/arm/util/unwind.c
new file mode 100644 (file)
index 0000000..da3dc95
--- /dev/null
@@ -0,0 +1,48 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+int unwind__arch_reg_id(int regnum)
+{
+       switch (regnum) {
+       case UNW_ARM_R0:
+               return PERF_REG_ARM_R0;
+       case UNW_ARM_R1:
+               return PERF_REG_ARM_R1;
+       case UNW_ARM_R2:
+               return PERF_REG_ARM_R2;
+       case UNW_ARM_R3:
+               return PERF_REG_ARM_R3;
+       case UNW_ARM_R4:
+               return PERF_REG_ARM_R4;
+       case UNW_ARM_R5:
+               return PERF_REG_ARM_R5;
+       case UNW_ARM_R6:
+               return PERF_REG_ARM_R6;
+       case UNW_ARM_R7:
+               return PERF_REG_ARM_R7;
+       case UNW_ARM_R8:
+               return PERF_REG_ARM_R8;
+       case UNW_ARM_R9:
+               return PERF_REG_ARM_R9;
+       case UNW_ARM_R10:
+               return PERF_REG_ARM_R10;
+       case UNW_ARM_R11:
+               return PERF_REG_ARM_FP;
+       case UNW_ARM_R12:
+               return PERF_REG_ARM_IP;
+       case UNW_ARM_R13:
+               return PERF_REG_ARM_SP;
+       case UNW_ARM_R14:
+               return PERF_REG_ARM_LR;
+       case UNW_ARM_R15:
+               return PERF_REG_ARM_PC;
+       default:
+               pr_err("unwind: invalid reg id %d\n", regnum);
+               return -EINVAL;
+       }
+
+       return -EINVAL;
+}
index 58b2d37ae23a19db3719ebfa29c98a79905a4c49..f5905f2b197d81d7cb2c8e0ee5a3f2cc257d5f20 100644 (file)
@@ -31,6 +31,10 @@ ifeq ($(ARCH),x86_64)
   endif
   NO_PERF_REGS := 0
 endif
+ifeq ($(ARCH),arm)
+  NO_PERF_REGS := 0
+  LIBUNWIND_LIBS = -lunwind -lunwind-arm
+endif
 
 ifeq ($(NO_PERF_REGS),0)
   CFLAGS += -DHAVE_PERF_REGS_SUPPORT
@@ -305,8 +309,7 @@ ifndef NO_LIBELF
   endif # NO_DWARF
 endif # NO_LIBELF
 
-# There's only x86 (both 32 and 64) support for CFI unwind so far
-ifneq ($(ARCH),x86)
+ifeq ($(LIBUNWIND_LIBS),)
   NO_LIBUNWIND := 1
 endif
 
@@ -322,8 +325,13 @@ ifndef NO_LIBUNWIND
   endif
 
   ifneq ($(feature-libunwind), 1)
-    msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99);
+    msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
     NO_LIBUNWIND := 1
+  else
+    ifneq ($(feature-libunwind-debug-frame), 1)
+      msg := $(warning No debug_frame support found in libunwind);
+      CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+    endif
   endif
 endif
 
index c803f17fb9869648c97ba9efe93e3a05b4e4329d..e8e195f49a4ee7695a80b2a4d535fb35a5aac504 100644 (file)
@@ -23,6 +23,7 @@ FILES=                                        \
        test-libpython-version          \
        test-libslang                   \
        test-libunwind                  \
+       test-libunwind-debug-frame      \
        test-on-exit                    \
        test-stackprotector-all         \
        test-stackprotector             \
index 59e7a705e146d4eb8ea029ebf74878411f8e3898..799865b607721e247b847711cf656ef375ada341 100644 (file)
 # include "test-libunwind.c"
 #undef main
 
+#define main main_test_libunwind_debug_frame
+# include "test-libunwind-debug-frame.c"
+#undef main
+
 #define main main_test_libaudit
 # include "test-libaudit.c"
 #undef main
diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
new file mode 100644 (file)
index 0000000..0ef8087
--- /dev/null
@@ -0,0 +1,16 @@
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+                                unw_word_t ip, unw_word_t segbase,
+                                const char *obj_name, unw_word_t start,
+                                unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+int main(void)
+{
+       dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
+       return 0;
+}
index 2f891f7e70bf9251c849780ec008ded14a2776c9..5390d0b8862a680e147cf52dd8bed22cfbba333f 100644 (file)
@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
 
 #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
 
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+                                unw_word_t ip,
+                                unw_word_t segbase,
+                                const char *obj_name, unw_word_t start,
+                                unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
 #define DW_EH_PE_FORMAT_MASK   0x0f    /* format of the encoded value */
 #define DW_EH_PE_APPL_MASK     0x70    /* how the value is to be applied */
 
@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
        return 0;
 }
 
-static int read_unwind_spec(struct dso *dso, struct machine *machine,
-                           u64 *table_data, u64 *segbase, u64 *fde_count)
+static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
+                                    u64 *table_data, u64 *segbase,
+                                    u64 *fde_count)
 {
        int ret = -EINVAL, fd;
        u64 offset;
@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
        if (fd < 0)
                return -EINVAL;
 
+       /* Check the .eh_frame section for unwinding info */
        offset = elf_section_offset(fd, ".eh_frame_hdr");
        close(fd);
 
@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
                                          table_data, segbase,
                                          fde_count);
 
-       /* TODO .debug_frame check if eh_frame_hdr fails */
        return ret;
 }
 
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+static int read_unwind_spec_debug_frame(struct dso *dso,
+                                       struct machine *machine, u64 *offset)
+{
+       int fd = dso__data_fd(dso, machine);
+
+       if (fd < 0)
+               return -EINVAL;
+
+       /* Check the .debug_frame section for unwinding info */
+       *offset = elf_section_offset(fd, ".debug_frame");
+       close(fd);
+
+       if (*offset)
+               return 0;
+
+       return -EINVAL;
+}
+#endif
+
 static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
 {
        struct addr_location al;
@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
 
        pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
 
-       if (read_unwind_spec(map->dso, ui->machine,
-                            &table_data, &segbase, &fde_count))
-               return -EINVAL;
+       /* Check the .eh_frame section for unwinding info */
+       if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
+                                      &table_data, &segbase, &fde_count)) {
+               memset(&di, 0, sizeof(di));
+               di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
+               di.start_ip = map->start;
+               di.end_ip   = map->end;
+               di.u.rti.segbase    = map->start + segbase;
+               di.u.rti.table_data = map->start + table_data;
+               di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
+                                     / sizeof(unw_word_t);
+               return dwarf_search_unwind_table(as, ip, &di, pi,
+                                                need_unwind_info, arg);
+       }
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+       /* Check the .debug_frame section for unwinding info */
+       if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
+               memset(&di, 0, sizeof(di));
+               dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name,
+                                      map->start, map->end);
+               return dwarf_search_unwind_table(as, ip, &di, pi,
+                                                need_unwind_info, arg);
+       }
+#endif
 
-       memset(&di, 0, sizeof(di));
-       di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
-       di.start_ip = map->start;
-       di.end_ip   = map->end;
-       di.u.rti.segbase    = map->start + segbase;
-       di.u.rti.table_data = map->start + table_data;
-       di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
-                             / sizeof(unw_word_t);
-       return dwarf_search_unwind_table(as, ip, &di, pi,
-                                        need_unwind_info, arg);
+       return -EINVAL;
 }
 
 static int access_fpreg(unw_addr_space_t __maybe_unused as,