Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 13 Nov 2013 23:51:29 +0000 (08:51 +0900)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 13 Nov 2013 23:51:29 +0000 (08:51 +0900)
Pull ARM updates from Russell King:
 "Included in this series are:

   1. BE8 (modern big endian) changes for ARM from Ben Dooks
   2. big.Little support from Nicolas Pitre and Dave Martin
   3. support for LPAE systems with all system memory above 4GB
   4. Perf updates from Will Deacon
   5. Additional prefetching and other performance improvements from Will.
   6. Neon-optimised AES implementation from Ard.
   7. A number of smaller fixes scattered around the place.

  There is a rather horrid merge conflict in tools/perf - I was never
  notified of the conflict because it originally occurred between Will's
  tree and other stuff.  Consequently I have a resolution which Will
  forwarded me, which I'll forward on immediately after sending this
  mail.

  The other notable thing is I'm expecting some build breakage in the
  crypto stuff on ARM only with Ard's AES patches.  These were merged
  into a stable git branch which others had already pulled, so there's
  little I can do about this.  The problem is caused because these
  patches have a dependency on some code in the crypto git tree - I
  tried requesting a branch I can pull to resolve these, and all I got
  each time from the crypto people was "we'll revert our patches then"
  which would only make things worse since I still don't have the
  dependent patches.  I've no idea what's going on there or how to
  resolve that, and since I can't split these patches from the rest of
  this pull request, I'm rather stuck with pushing this as-is or
  reverting Ard's patches.

  Since it should "come out in the wash" I've left them in - the only
  build problems they seem to cause at the moment are with randconfigs,
  and it's a new feature anyway.  However, if by -rc1 the
  dependencies aren't in, I think it'd be best to revert Ard's patches"

I resolved the perf conflict roughly as per the patch sent by Russell,
but there may be some differences.  Any errors are likely mine.  Let's
see how the crypto issues work out..

* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (110 commits)
  ARM: 7868/1: arm/arm64: remove atomic_clear_mask() in "include/asm/atomic.h"
  ARM: 7867/1: include: asm: use 'int' instead of 'unsigned long' for 'oldval' in atomic_cmpxchg().
  ARM: 7866/1: include: asm: use 'long long' instead of 'u64' within atomic.h
  ARM: 7871/1: amba: Extend number of IRQS
  ARM: 7887/1: Don't smp_cross_call() on UP devices in arch_irq_work_raise()
  ARM: 7872/1: Support arch_irq_work_raise() via self IPIs
  ARM: 7880/1: Clear the IT state independent of the Thumb-2 mode
  ARM: 7878/1: nommu: Implement dummy early_paging_init()
  ARM: 7876/1: clear Thumb-2 IT state on exception handling
  ARM: 7874/2: bL_switcher: Remove cpu_hotplug_driver_{lock,unlock}()
  ARM: footbridge: fix build warnings for netwinder
  ARM: 7873/1: vfp: clear vfp_current_hw_state for dying cpu
  ARM: fix misplaced arch_virt_to_idmap()
  ARM: 7848/1: mcpm: Implement cpu_kill() to synchronise on powerdown
  ARM: 7847/1: mcpm: Factor out logical-to-physical CPU translation
  ARM: 7869/1: remove unused XSCALE_PMU Kconfig param
  ARM: 7864/1: Handle 64-bit memory in case of 32-bit phys_addr_t
  ARM: 7863/1: Let arm_add_memory() always use 64-bit arguments
  ARM: 7862/1: pcpu: replace __get_cpu_var_uses
  ARM: 7861/1: cacheflush: consolidate single-CPU ARMv7 cache disabling code
  ...

121 files changed:
arch/arm/Kconfig
arch/arm/Kconfig.debug
arch/arm/Makefile
arch/arm/boot/compressed/head.S
arch/arm/common/Makefile
arch/arm/common/bL_switcher.c [new file with mode: 0644]
arch/arm/common/bL_switcher_dummy_if.c [new file with mode: 0644]
arch/arm/common/mcpm_entry.c
arch/arm/common/mcpm_head.S
arch/arm/common/mcpm_platsmp.c
arch/arm/common/timer-sp.c
arch/arm/configs/h3600_defconfig
arch/arm/crypto/.gitignore [new file with mode: 0644]
arch/arm/crypto/Makefile
arch/arm/crypto/aes_glue.c
arch/arm/crypto/aes_glue.h [new file with mode: 0644]
arch/arm/crypto/aesbs-core.S_shipped [new file with mode: 0644]
arch/arm/crypto/aesbs-glue.c [new file with mode: 0644]
arch/arm/crypto/bsaes-armv7.pl [new file with mode: 0644]
arch/arm/include/asm/Kbuild
arch/arm/include/asm/assembler.h
arch/arm/include/asm/atomic.h
arch/arm/include/asm/bL_switcher.h [new file with mode: 0644]
arch/arm/include/asm/bug.h
arch/arm/include/asm/cacheflush.h
arch/arm/include/asm/cmpxchg.h
arch/arm/include/asm/cputype.h
arch/arm/include/asm/hardirq.h
arch/arm/include/asm/hardware/coresight.h
arch/arm/include/asm/kgdb.h
arch/arm/include/asm/mach/arch.h
arch/arm/include/asm/mcpm.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/mmu.h
arch/arm/include/asm/pgtable-2level.h
arch/arm/include/asm/pgtable-3level.h
arch/arm/include/asm/processor.h
arch/arm/include/asm/setup.h
arch/arm/include/asm/smp.h
arch/arm/include/asm/spinlock.h
arch/arm/include/asm/spinlock_types.h
arch/arm/include/asm/tlbflush.h
arch/arm/include/asm/unified.h
arch/arm/include/debug/efm32.S [new file with mode: 0644]
arch/arm/include/debug/msm.S
arch/arm/include/debug/pl01x.S
arch/arm/include/uapi/asm/Kbuild
arch/arm/include/uapi/asm/perf_regs.h [new file with mode: 0644]
arch/arm/kernel/Makefile
arch/arm/kernel/armksyms.c
arch/arm/kernel/entry-armv.S
arch/arm/kernel/entry-common.S
arch/arm/kernel/head.S
arch/arm/kernel/hw_breakpoint.c
arch/arm/kernel/kprobes.c
arch/arm/kernel/module.c
arch/arm/kernel/perf_event.c
arch/arm/kernel/perf_event_cpu.c
arch/arm/kernel/perf_regs.c [new file with mode: 0644]
arch/arm/kernel/setup.c
arch/arm/kernel/signal.c
arch/arm/kernel/sigreturn_codes.S [new file with mode: 0644]
arch/arm/kernel/sleep.S
arch/arm/kernel/smp.c
arch/arm/kernel/smp_scu.c
arch/arm/kernel/smp_tlb.c
arch/arm/kernel/smp_twd.c
arch/arm/kernel/suspend.c
arch/arm/kernel/traps.c
arch/arm/kvm/arm.c
arch/arm/lib/bitops.h
arch/arm/lib/uaccess_with_memcpy.c
arch/arm/mach-footbridge/netwinder-hw.c
arch/arm/mach-highbank/Kconfig
arch/arm/mach-ixp4xx/Kconfig
arch/arm/mach-mvebu/Kconfig
arch/arm/mach-mvebu/coherency_ll.S
arch/arm/mach-mvebu/headsmp.S
arch/arm/mach-sa1100/assabet.c
arch/arm/mach-sa1100/include/mach/gpio.h [deleted file]
arch/arm/mach-sa1100/include/mach/h3xxx.h
arch/arm/mach-sa1100/simpad.c
arch/arm/mach-tegra/Kconfig
arch/arm/mach-vexpress/Kconfig
arch/arm/mach-vexpress/dcscb.c
arch/arm/mach-vexpress/tc2_pm.c
arch/arm/mm/Kconfig
arch/arm/mm/abort-ev6.S
arch/arm/mm/alignment.c
arch/arm/mm/dma-mapping.c
arch/arm/mm/extable.c
arch/arm/mm/idmap.c
arch/arm/mm/mmap.c
arch/arm/mm/mmu.c
arch/arm/mm/nommu.c
arch/arm/mm/proc-v6.S
arch/arm/mm/proc-v7.S
arch/arm/net/bpf_jit_32.c
arch/arm/plat-versatile/headsmp.S
arch/arm/vfp/vfpmodule.c
arch/arm64/include/asm/atomic.h
arch/arm64/kernel/debug-monitors.c
arch/arm64/kernel/hw_breakpoint.c
arch/arm64/kernel/perf_event.c
crypto/Kconfig
drivers/bus/arm-cci.c
drivers/gpio/gpio-sa1100.c
drivers/irqchip/irq-gic.c
drivers/mmc/host/mmci.c
drivers/mmc/host/mmci.h
include/linux/amba/bus.h
include/linux/irqchip/arm-gic.h
include/trace/events/power_cpu_migrate.h [new file with mode: 0644]
tools/perf/arch/arm/Makefile
tools/perf/arch/arm/include/perf_regs.h [new file with mode: 0644]
tools/perf/arch/arm/util/unwind.c [new file with mode: 0644]
tools/perf/config/Makefile
tools/perf/config/feature-checks/Makefile
tools/perf/config/feature-checks/test-all.c
tools/perf/config/feature-checks/test-libunwind-debug-frame.c [new file with mode: 0644]
tools/perf/util/unwind.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index acb80708accdb5d9af59aedb89c9a2bfb308e990..603d661b445d4a2690c7bcb5dbd9cb6d282cc0e6 100644 (file)
@@ -5,6 +5,7 @@ config ARM
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAVE_CUSTOM_GPIO_H
+       select ARCH_USE_CMPXCHG_LOCKREF
        select ARCH_WANT_IPC_PARSE_VERSION
        select BUILDTIME_EXTABLE_SORT if MMU
        select CLONE_BACKWARDS
@@ -51,6 +52,8 @@ config ARM
        select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
        select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
        select HAVE_PERF_EVENTS
+       select HAVE_PERF_REGS
+       select HAVE_PERF_USER_STACK_DUMP
        select HAVE_REGS_AND_STACK_ACCESS_API
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_UID16
@@ -481,6 +484,7 @@ config ARCH_IXP4XX
        bool "IXP4xx-based"
        depends on MMU
        select ARCH_HAS_DMA_SET_COHERENT_MASK
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select ARCH_REQUIRE_GPIOLIB
        select CLKSRC_MMIO
        select CPU_XSCALE
@@ -688,7 +692,6 @@ config ARCH_SA1100
        select GENERIC_CLOCKEVENTS
        select HAVE_IDE
        select ISA
-       select NEED_MACH_GPIO_H
        select NEED_MACH_MEMORY_H
        select SPARSE_IRQ
        help
@@ -1064,11 +1067,6 @@ config IWMMXT
          Enable support for iWMMXt context switching at run time if
          running on a CPU that supports it.
 
-config XSCALE_PMU
-       bool
-       depends on CPU_XSCALE
-       default y
-
 config MULTI_IRQ_HANDLER
        bool
        help
@@ -1516,6 +1514,32 @@ config MCPM
          for (multi-)cluster based systems, such as big.LITTLE based
          systems.
 
+config BIG_LITTLE
+       bool "big.LITTLE support (Experimental)"
+       depends on CPU_V7 && SMP
+       select MCPM
+       help
+         This option enables support selections for the big.LITTLE
+         system architecture.
+
+config BL_SWITCHER
+       bool "big.LITTLE switcher support"
+       depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+       select CPU_PM
+       select ARM_CPU_SUSPEND
+       help
+         The big.LITTLE "switcher" provides the core functionality to
+         transparently handle transition between a cluster of A15's
+         and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+       tristate "Simple big.LITTLE switcher user interface"
+       depends on BL_SWITCHER && DEBUG_KERNEL
+       help
+         This is a simple and dummy char dev interface to control
+         the big.LITTLE switcher core code.  It is meant for
+         debugging purposes only.
+
 choice
        prompt "Memory split"
        default VMSPLIT_3G
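
The new Kconfig entries above form a small hierarchy: BIG_LITTLE selects MCPM, BL_SWITCHER depends on BIG_LITTLE plus HOTPLUG_CPU and selects CPU_PM and ARM_CPU_SUSPEND, and the dummy interface is a tristate gated on DEBUG_KERNEL.  As a rough illustration only (not part of the commit), a .config fragment exercising all three might look like:

# Illustrative fragment; CONFIG_HOTPLUG_CPU and CONFIG_DEBUG_KERNEL are
# assumed to be enabled already, MCPM/CPU_PM/ARM_CPU_SUSPEND come in
# via "select".
CONFIG_BIG_LITTLE=y
CONFIG_BL_SWITCHER=y
CONFIG_BL_SWITCHER_DUMMY_IF=m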
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index d597c6b8488ba257fa37319d9afc7dc6362933a7..5765abf5ce84576d8de31df83d709160905d7b19 100644 (file)
@@ -318,6 +318,7 @@ choice
        config DEBUG_MSM_UART1
                bool "Kernel low-level debugging messages via MSM UART1"
                depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the first serial port on MSM devices.
@@ -325,6 +326,7 @@ choice
        config DEBUG_MSM_UART2
                bool "Kernel low-level debugging messages via MSM UART2"
                depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the second serial port on MSM devices.
@@ -332,6 +334,7 @@ choice
        config DEBUG_MSM_UART3
                bool "Kernel low-level debugging messages via MSM UART3"
                depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the third serial port on MSM devices.
@@ -340,6 +343,7 @@ choice
                bool "Kernel low-level debugging messages via MSM 8660 UART"
                depends on ARCH_MSM8X60
                select MSM_HAS_DEBUG_UART_HS
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the serial port on MSM 8660 devices.
@@ -348,10 +352,20 @@ choice
                bool "Kernel low-level debugging messages via MSM 8960 UART"
                depends on ARCH_MSM8960
                select MSM_HAS_DEBUG_UART_HS
+               select DEBUG_MSM_UART
                help
                  Say Y here if you want the debug print routines to direct
                  their output to the serial port on MSM 8960 devices.
 
+       config DEBUG_MSM8974_UART
+               bool "Kernel low-level debugging messages via MSM 8974 UART"
+               depends on ARCH_MSM8974
+               select MSM_HAS_DEBUG_UART_HS
+               select DEBUG_MSM_UART
+               help
+                 Say Y here if you want the debug print routines to direct
+                 their output to the serial port on MSM 8974 devices.
+
        config DEBUG_MVEBU_UART
                bool "Kernel low-level debugging messages via MVEBU UART (old bootloaders)"
                depends on ARCH_MVEBU
@@ -841,6 +855,20 @@ choice
                  options; the platform specific options are deprecated
                  and will be soon removed.
 
+       config DEBUG_LL_UART_EFM32
+               bool "Kernel low-level debugging via efm32 UART"
+               depends on ARCH_EFM32
+               help
+                 Say Y here if you want the debug print routines to direct
+                 their output to an UART or USART port on efm32 based
+                 machines. Use the following addresses for DEBUG_UART_PHYS:
+
+                   0x4000c000 | USART0
+                   0x4000c400 | USART1
+                   0x4000c800 | USART2
+                   0x4000e000 | UART0
+                   0x4000e400 | UART1
+
        config DEBUG_LL_UART_PL01X
                bool "Kernel low-level debugging via ARM Ltd PL01x Primecell UART"
                help
@@ -887,11 +915,16 @@ config DEBUG_STI_UART
        bool
        depends on ARCH_STI
 
+config DEBUG_MSM_UART
+       bool
+       depends on ARCH_MSM
+
 config DEBUG_LL_INCLUDE
        string
        default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250
        default "debug/pl01x.S" if DEBUG_LL_UART_PL01X || DEBUG_UART_PL01X
        default "debug/exynos.S" if DEBUG_EXYNOS_UART
+       default "debug/efm32.S" if DEBUG_LL_UART_EFM32
        default "debug/icedcc.S" if DEBUG_ICEDCC
        default "debug/imx.S" if DEBUG_IMX1_UART || \
                                 DEBUG_IMX25_UART || \
@@ -902,11 +935,7 @@ config DEBUG_LL_INCLUDE
                                 DEBUG_IMX53_UART ||\
                                 DEBUG_IMX6Q_UART || \
                                 DEBUG_IMX6SL_UART
-       default "debug/msm.S" if DEBUG_MSM_UART1 || \
-                                DEBUG_MSM_UART2 || \
-                                DEBUG_MSM_UART3 || \
-                                DEBUG_MSM8660_UART || \
-                                DEBUG_MSM8960_UART
+       default "debug/msm.S" if DEBUG_MSM_UART
        default "debug/omap2plus.S" if DEBUG_OMAP2PLUS_UART
        default "debug/sirf.S" if DEBUG_SIRFPRIMA2_UART1 || DEBUG_SIRFMARCO_UART1
        default "debug/sti.S" if DEBUG_STI_UART
@@ -959,6 +988,7 @@ config DEBUG_UART_PHYS
        default 0x20064000 if DEBUG_RK29_UART1 || DEBUG_RK3X_UART2
        default 0x20068000 if DEBUG_RK29_UART2 || DEBUG_RK3X_UART3
        default 0x20201000 if DEBUG_BCM2835
+       default 0x4000e400 if DEBUG_LL_UART_EFM32
        default 0x40090000 if ARCH_LPC32XX
        default 0x40100000 if DEBUG_PXA_UART1
        default 0x42000000 if ARCH_GEMINI
@@ -989,6 +1019,7 @@ config DEBUG_UART_PHYS
        default 0xfff36000 if DEBUG_HIGHBANK_UART
        default 0xfffff700 if ARCH_IOP33X
        depends on DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \
+               DEBUG_LL_UART_EFM32 || \
                DEBUG_UART_8250 || DEBUG_UART_PL01X
 
 config DEBUG_UART_VIRT
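
A side note on the DEBUG_LL_UART_EFM32 help text earlier in this hunk: on an efm32 machine (ARCH_EFM32 assumed), a hypothetical low-level debug configuration would be little more than the fragment below.  This is an illustration only, not taken from the commit.

CONFIG_DEBUG_LL=y
CONFIG_DEBUG_LL_UART_EFM32=y
# DEBUG_UART_PHYS defaults to 0x4000e400 (UART1); the other USART/UART
# base addresses listed in the help text can be substituted by hand.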
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 8b667132d7b419bb572549d3f5149c7f9e503860..c99b1086d83dfa8c0c407bab392ff5bb12927ab5 100644 (file)
@@ -16,6 +16,7 @@ LDFLAGS               :=
 LDFLAGS_vmlinux        :=-p --no-undefined -X
 ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
 LDFLAGS_vmlinux        += --be8
+LDFLAGS_MODULE += --be8
 endif
 
 OBJCOPYFLAGS   :=-O binary -R .comment -S
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 75189f13cf54c63d0db56b98ad2c41834a30893e..066b03480b63caacc04abf1ef0ec5ccf037fa52d 100644 (file)
@@ -135,6 +135,7 @@ start:
                .word   _edata                  @ zImage end address
  THUMB(                .thumb                  )
 1:
+ ARM_BE8(      setend  be )                    @ go BE8 if compiled for BE8
                mrs     r9, cpsr
 #ifdef CONFIG_ARM_VIRT_EXT
                bl      __hyp_stub_install      @ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-               orr     r0, r0, #1 << 25        @ big-endian page tables
-#endif
+ ARM_BE8(      orr     r0, r0, #1 << 25 )      @ big-endian page tables
                bl      __common_mmu_cache_on
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
                orr     r0, r0, #1 << 22        @ U (v6 unaligned access model)
                                                @ (needed for ARM1176)
 #ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-               orr     r0, r0, #1 << 25        @ big-endian page tables
-#endif
+ ARM_BE8(      orr     r0, r0, #1 << 25 )      @ big-endian page tables
                mrcne   p15, 0, r6, c2, c0, 2   @ read ttb control reg
                orrne   r0, r0, #1              @ MMU enabled
                movne   r1, #0xfffffffd         @ domain 0 = client
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index eaa9cf4705a7179dcb7221aa5f2a29a6c1883b78..4bdc41622c36686df868e2134d6453fe60684b5b 100644 (file)
@@ -16,3 +16,5 @@ obj-$(CONFIG_MCPM)            += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 AFLAGS_mcpm_head.o             := -march=armv7-a
 AFLAGS_vlock.o                 := -march=armv7-a
 obj-$(CONFIG_TI_PRIV_EDMA)     += edma.o
+obj-$(CONFIG_BL_SWITCHER)      += bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
new file mode 100644 (file)
index 0000000..5774b6e
--- /dev/null
@@ -0,0 +1,822 @@
+/*
+ * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
+ *
+ * Created by: Nicolas Pitre, March 2012
+ * Copyright:  (C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp_plat.h>
+#include <asm/cputype.h>
+#include <asm/suspend.h>
+#include <asm/mcpm.h>
+#include <asm/bL_switcher.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power_cpu_migrate.h>
+
+
+/*
+ * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
+ * __attribute_const__ and we don't want the compiler to assume any
+ * constness here as the value _does_ change along some code paths.
+ */
+
+static int read_mpidr(void)
+{
+       unsigned int id;
+       asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
+       return id & MPIDR_HWID_BITMASK;
+}
+
+/*
+ * Get a global nanosecond time stamp for tracing.
+ */
+static s64 get_ns(void)
+{
+       struct timespec ts;
+       getnstimeofday(&ts);
+       return timespec_to_ns(&ts);
+}
+
+/*
+ * bL switcher core code.
+ */
+
+static void bL_do_switch(void *_arg)
+{
+       unsigned ib_mpidr, ib_cpu, ib_cluster;
+       long volatile handshake, **handshake_ptr = _arg;
+
+       pr_debug("%s\n", __func__);
+
+       ib_mpidr = cpu_logical_map(smp_processor_id());
+       ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+       ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+       /* Advertise our handshake location */
+       if (handshake_ptr) {
+               handshake = 0;
+               *handshake_ptr = &handshake;
+       } else
+               handshake = -1;
+
+       /*
+        * Our state has been saved at this point.  Let's release our
+        * inbound CPU.
+        */
+       mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
+       sev();
+
+       /*
+        * From this point, we must assume that our counterpart CPU might
+        * have taken over in its parallel world already, as if execution
+        * just returned from cpu_suspend().  It is therefore important to
+        * be very careful not to make any change the other guy is not
+        * expecting.  This is why we need stack isolation.
+        *
+        * Fancy under cover tasks could be performed here.  For now
+        * we have none.
+        */
+
+       /*
+        * Let's wait until our inbound is alive.
+        */
+       while (!handshake) {
+               wfe();
+               smp_mb();
+       }
+
+       /* Let's put ourself down. */
+       mcpm_cpu_power_down();
+
+       /* should never get here */
+       BUG();
+}
+
+/*
+ * Stack isolation.  To ensure 'current' remains valid, we just use another
+ * piece of our thread's stack space which should be fairly lightly used.
+ * The selected area starts just above the thread_info structure located
+ * at the very bottom of the stack, aligned to a cache line, and indexed
+ * with the cluster number.
+ */
+#define STACK_SIZE 512
+extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
+static int bL_switchpoint(unsigned long _arg)
+{
+       unsigned int mpidr = read_mpidr();
+       unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+       void *stack = current_thread_info() + 1;
+       stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
+       stack += clusterid * STACK_SIZE + STACK_SIZE;
+       call_with_stack(bL_do_switch, (void *)_arg, stack);
+       BUG();
+}
+
+/*
+ * Generic switcher interface
+ */
+
+static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
+static int bL_switcher_cpu_pairing[NR_CPUS];
+
+/*
+ * bL_switch_to - Switch to a specific cluster for the current CPU
+ * @new_cluster_id: the ID of the cluster to switch to.
+ *
+ * This function must be called on the CPU to be switched.
+ * Returns 0 on success, else a negative status code.
+ */
+static int bL_switch_to(unsigned int new_cluster_id)
+{
+       unsigned int mpidr, this_cpu, that_cpu;
+       unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
+       struct completion inbound_alive;
+       struct tick_device *tdev;
+       enum clock_event_mode tdev_mode;
+       long volatile *handshake_ptr;
+       int ipi_nr, ret;
+
+       this_cpu = smp_processor_id();
+       ob_mpidr = read_mpidr();
+       ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
+       ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
+       BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
+
+       if (new_cluster_id == ob_cluster)
+               return 0;
+
+       that_cpu = bL_switcher_cpu_pairing[this_cpu];
+       ib_mpidr = cpu_logical_map(that_cpu);
+       ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+       ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+       pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
+                this_cpu, ob_mpidr, ib_mpidr);
+
+       this_cpu = smp_processor_id();
+
+       /* Close the gate for our entry vectors */
+       mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
+       mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
+
+       /* Install our "inbound alive" notifier. */
+       init_completion(&inbound_alive);
+       ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
+       ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
+       mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
+
+       /*
+        * Let's wake up the inbound CPU now in case it requires some delay
+        * to come online, but leave it gated in our entry vector code.
+        */
+       ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
+       if (ret) {
+               pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
+               return ret;
+       }
+
+       /*
+        * Raise a SGI on the inbound CPU to make sure it doesn't stall
+        * in a possible WFI, such as in bL_power_down().
+        */
+       gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
+
+       /*
+        * Wait for the inbound to come up.  This allows for other
+        * tasks to be scheduled in the mean time.
+        */
+       wait_for_completion(&inbound_alive);
+       mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
+
+       /*
+        * From this point we are entering the switch critical zone
+        * and can't take any interrupts anymore.
+        */
+       local_irq_disable();
+       local_fiq_disable();
+       trace_cpu_migrate_begin(get_ns(), ob_mpidr);
+
+       /* redirect GIC's SGIs to our counterpart */
+       gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
+
+       tdev = tick_get_device(this_cpu);
+       if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
+               tdev = NULL;
+       if (tdev) {
+               tdev_mode = tdev->evtdev->mode;
+               clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+       }
+
+       ret = cpu_pm_enter();
+
+       /* we can not tolerate errors at this point */
+       if (ret)
+               panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
+
+       /* Swap the physical CPUs in the logical map for this logical CPU. */
+       cpu_logical_map(this_cpu) = ib_mpidr;
+       cpu_logical_map(that_cpu) = ob_mpidr;
+
+       /* Let's do the actual CPU switch. */
+       ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
+       if (ret > 0)
+               panic("%s: cpu_suspend() returned %d\n", __func__, ret);
+
+       /* We are executing on the inbound CPU at this point */
+       mpidr = read_mpidr();
+       pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
+       BUG_ON(mpidr != ib_mpidr);
+
+       mcpm_cpu_powered_up();
+
+       ret = cpu_pm_exit();
+
+       if (tdev) {
+               clockevents_set_mode(tdev->evtdev, tdev_mode);
+               clockevents_program_event(tdev->evtdev,
+                                         tdev->evtdev->next_event, 1);
+       }
+
+       trace_cpu_migrate_finish(get_ns(), ib_mpidr);
+       local_fiq_enable();
+       local_irq_enable();
+
+       *handshake_ptr = 1;
+       dsb_sev();
+
+       if (ret)
+               pr_err("%s exiting with error %d\n", __func__, ret);
+       return ret;
+}
+
+struct bL_thread {
+       spinlock_t lock;
+       struct task_struct *task;
+       wait_queue_head_t wq;
+       int wanted_cluster;
+       struct completion started;
+       bL_switch_completion_handler completer;
+       void *completer_cookie;
+};
+
+static struct bL_thread bL_threads[NR_CPUS];
+
+static int bL_switcher_thread(void *arg)
+{
+       struct bL_thread *t = arg;
+       struct sched_param param = { .sched_priority = 1 };
+       int cluster;
+       bL_switch_completion_handler completer;
+       void *completer_cookie;
+
+       sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+       complete(&t->started);
+
+       do {
+               if (signal_pending(current))
+                       flush_signals(current);
+               wait_event_interruptible(t->wq,
+                               t->wanted_cluster != -1 ||
+                               kthread_should_stop());
+
+               spin_lock(&t->lock);
+               cluster = t->wanted_cluster;
+               completer = t->completer;
+               completer_cookie = t->completer_cookie;
+               t->wanted_cluster = -1;
+               t->completer = NULL;
+               spin_unlock(&t->lock);
+
+               if (cluster != -1) {
+                       bL_switch_to(cluster);
+
+                       if (completer)
+                               completer(completer_cookie);
+               }
+       } while (!kthread_should_stop());
+
+       return 0;
+}
+
+static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
+{
+       struct task_struct *task;
+
+       task = kthread_create_on_node(bL_switcher_thread, arg,
+                                     cpu_to_node(cpu), "kswitcher_%d", cpu);
+       if (!IS_ERR(task)) {
+               kthread_bind(task, cpu);
+               wake_up_process(task);
+       } else
+               pr_err("%s failed for CPU %d\n", __func__, cpu);
+       return task;
+}
+
+/*
+ * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
+ *      with completion notification via a callback
+ *
+ * @cpu: the CPU to switch
+ * @new_cluster_id: the ID of the cluster to switch to.
+ * @completer: switch completion callback.  if non-NULL,
+ *     @completer(@completer_cookie) will be called on completion of
+ *     the switch, in non-atomic context.
+ * @completer_cookie: opaque context argument for @completer.
+ *
+ * This function causes a cluster switch on the given CPU by waking up
+ * the appropriate switcher thread.  This function may or may not return
+ * before the switch has occurred.
+ *
+ * If a @completer callback function is supplied, it will be called when
+ * the switch is complete.  This can be used to determine asynchronously
+ * when the switch is complete, regardless of when bL_switch_request()
+ * returns.  When @completer is supplied, no new switch request is permitted
+ * for the affected CPU until after the switch is complete, and @completer
+ * has returned.
+ */
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+                        bL_switch_completion_handler completer,
+                        void *completer_cookie)
+{
+       struct bL_thread *t;
+
+       if (cpu >= ARRAY_SIZE(bL_threads)) {
+               pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+               return -EINVAL;
+       }
+
+       t = &bL_threads[cpu];
+
+       if (IS_ERR(t->task))
+               return PTR_ERR(t->task);
+       if (!t->task)
+               return -ESRCH;
+
+       spin_lock(&t->lock);
+       if (t->completer) {
+               spin_unlock(&t->lock);
+               return -EBUSY;
+       }
+       t->completer = completer;
+       t->completer_cookie = completer_cookie;
+       t->wanted_cluster = new_cluster_id;
+       spin_unlock(&t->lock);
+       wake_up(&t->wq);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(bL_switch_request_cb);
+
+/*
+ * Activation and configuration code.
+ */
+
+static DEFINE_MUTEX(bL_switcher_activation_lock);
+static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
+static unsigned int bL_switcher_active;
+static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
+static cpumask_t bL_switcher_removed_logical_cpus;
+
+int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_register(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
+
+int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
+
+static int bL_activation_notify(unsigned long val)
+{
+       int ret;
+
+       ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
+       if (ret & NOTIFY_STOP_MASK)
+               pr_err("%s: notifier chain failed with status 0x%x\n",
+                       __func__, ret);
+       return notifier_to_errno(ret);
+}
+
+static void bL_switcher_restore_cpus(void)
+{
+       int i;
+
+       for_each_cpu(i, &bL_switcher_removed_logical_cpus)
+               cpu_up(i);
+}
+
+static int bL_switcher_halve_cpus(void)
+{
+       int i, j, cluster_0, gic_id, ret;
+       unsigned int cpu, cluster, mask;
+       cpumask_t available_cpus;
+
+       /* First pass to validate what we have */
+       mask = 0;
+       for_each_online_cpu(i) {
+               cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+               if (cluster >= 2) {
+                       pr_err("%s: only dual cluster systems are supported\n", __func__);
+                       return -EINVAL;
+               }
+               if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
+                       return -EINVAL;
+               mask |= (1 << cluster);
+       }
+       if (mask != 3) {
+               pr_err("%s: no CPU pairing possible\n", __func__);
+               return -EINVAL;
+       }
+
+       /*
+        * Now let's do the pairing.  We match each CPU with another CPU
+        * from a different cluster.  To get a uniform scheduling behavior
+        * without fiddling with CPU topology and compute capacity data,
+        * we'll use logical CPUs initially belonging to the same cluster.
+        */
+       memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
+       cpumask_copy(&available_cpus, cpu_online_mask);
+       cluster_0 = -1;
+       for_each_cpu(i, &available_cpus) {
+               int match = -1;
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+               if (cluster_0 == -1)
+                       cluster_0 = cluster;
+               if (cluster != cluster_0)
+                       continue;
+               cpumask_clear_cpu(i, &available_cpus);
+               for_each_cpu(j, &available_cpus) {
+                       cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
+                       /*
+                        * Let's remember the last match to create "odd"
+                        * pairings on purpose in order for other code not
+                        * to assume any relation between physical and
+                        * logical CPU numbers.
+                        */
+                       if (cluster != cluster_0)
+                               match = j;
+               }
+               if (match != -1) {
+                       bL_switcher_cpu_pairing[i] = match;
+                       cpumask_clear_cpu(match, &available_cpus);
+                       pr_info("CPU%d paired with CPU%d\n", i, match);
+               }
+       }
+
+       /*
+        * Now we disable the unwanted CPUs i.e. everything that has no
+        * pairing information (that includes the pairing counterparts).
+        */
+       cpumask_clear(&bL_switcher_removed_logical_cpus);
+       for_each_online_cpu(i) {
+               cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+
+               /* Let's take note of the GIC ID for this CPU */
+               gic_id = gic_get_cpu_id(i);
+               if (gic_id < 0) {
+                       pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
+                       bL_switcher_restore_cpus();
+                       return -EINVAL;
+               }
+               bL_gic_id[cpu][cluster] = gic_id;
+               pr_info("GIC ID for CPU %u cluster %u is %u\n",
+                       cpu, cluster, gic_id);
+
+               if (bL_switcher_cpu_pairing[i] != -1) {
+                       bL_switcher_cpu_original_cluster[i] = cluster;
+                       continue;
+               }
+
+               ret = cpu_down(i);
+               if (ret) {
+                       bL_switcher_restore_cpus();
+                       return ret;
+               }
+               cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
+       }
+
+       return 0;
+}
+
+/* Determine the logical CPU a given physical CPU is grouped on. */
+int bL_switcher_get_logical_index(u32 mpidr)
+{
+       int cpu;
+
+       if (!bL_switcher_active)
+               return -EUNATCH;
+
+       mpidr &= MPIDR_HWID_BITMASK;
+       for_each_online_cpu(cpu) {
+               int pairing = bL_switcher_cpu_pairing[cpu];
+               if (pairing == -1)
+                       continue;
+               if ((mpidr == cpu_logical_map(cpu)) ||
+                   (mpidr == cpu_logical_map(pairing)))
+                       return cpu;
+       }
+       return -EINVAL;
+}
+
+static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
+{
+       trace_cpu_migrate_current(get_ns(), read_mpidr());
+}
+
+int bL_switcher_trace_trigger(void)
+{
+       int ret;
+
+       preempt_disable();
+
+       bL_switcher_trace_trigger_cpu(NULL);
+       ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+
+       preempt_enable();
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
+
+static int bL_switcher_enable(void)
+{
+       int cpu, ret;
+
+       mutex_lock(&bL_switcher_activation_lock);
+       lock_device_hotplug();
+       if (bL_switcher_active) {
+               unlock_device_hotplug();
+               mutex_unlock(&bL_switcher_activation_lock);
+               return 0;
+       }
+
+       pr_info("big.LITTLE switcher initializing\n");
+
+       ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
+       if (ret)
+               goto error;
+
+       ret = bL_switcher_halve_cpus();
+       if (ret)
+               goto error;
+
+       bL_switcher_trace_trigger();
+
+       for_each_online_cpu(cpu) {
+               struct bL_thread *t = &bL_threads[cpu];
+               spin_lock_init(&t->lock);
+               init_waitqueue_head(&t->wq);
+               init_completion(&t->started);
+               t->wanted_cluster = -1;
+               t->task = bL_switcher_thread_create(cpu, t);
+       }
+
+       bL_switcher_active = 1;
+       bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+       pr_info("big.LITTLE switcher initialized\n");
+       goto out;
+
+error:
+       pr_warn("big.LITTLE switcher initialization failed\n");
+       bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+       unlock_device_hotplug();
+       mutex_unlock(&bL_switcher_activation_lock);
+       return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+static void bL_switcher_disable(void)
+{
+       unsigned int cpu, cluster;
+       struct bL_thread *t;
+       struct task_struct *task;
+
+       mutex_lock(&bL_switcher_activation_lock);
+       lock_device_hotplug();
+
+       if (!bL_switcher_active)
+               goto out;
+
+       if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
+               bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+               goto out;
+       }
+
+       bL_switcher_active = 0;
+
+       /*
+        * To deactivate the switcher, we must shut down the switcher
+        * threads to prevent any other requests from being accepted.
+        * Then, if the final cluster for given logical CPU is not the
+        * same as the original one, we'll recreate a switcher thread
+        * just for the purpose of switching the CPU back without any
+        * possibility for interference from external requests.
+        */
+       for_each_online_cpu(cpu) {
+               t = &bL_threads[cpu];
+               task = t->task;
+               t->task = NULL;
+               if (!task || IS_ERR(task))
+                       continue;
+               kthread_stop(task);
+               /* no more switch may happen on this CPU at this point */
+               cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+               if (cluster == bL_switcher_cpu_original_cluster[cpu])
+                       continue;
+               init_completion(&t->started);
+               t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
+               task = bL_switcher_thread_create(cpu, t);
+               if (!IS_ERR(task)) {
+                       wait_for_completion(&t->started);
+                       kthread_stop(task);
+                       cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+                       if (cluster == bL_switcher_cpu_original_cluster[cpu])
+                               continue;
+               }
+               /* If execution gets here, we're in trouble. */
+               pr_crit("%s: unable to restore original cluster for CPU %d\n",
+                       __func__, cpu);
+               pr_crit("%s: CPU %d can't be restored\n",
+                       __func__, bL_switcher_cpu_pairing[cpu]);
+               cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
+                                 &bL_switcher_removed_logical_cpus);
+       }
+
+       bL_switcher_restore_cpus();
+       bL_switcher_trace_trigger();
+
+       bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+       unlock_device_hotplug();
+       mutex_unlock(&bL_switcher_activation_lock);
+}
+
+static ssize_t bL_switcher_active_show(struct kobject *kobj,
+               struct kobj_attribute *attr, char *buf)
+{
+       return sprintf(buf, "%u\n", bL_switcher_active);
+}
+
+static ssize_t bL_switcher_active_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t count)
+{
+       int ret;
+
+       switch (buf[0]) {
+       case '0':
+               bL_switcher_disable();
+               ret = 0;
+               break;
+       case '1':
+               ret = bL_switcher_enable();
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return (ret >= 0) ? count : ret;
+}
+
+static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t count)
+{
+       int ret = bL_switcher_trace_trigger();
+
+       return ret ? ret : count;
+}
+
+static struct kobj_attribute bL_switcher_active_attr =
+       __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
+
+static struct kobj_attribute bL_switcher_trace_trigger_attr =
+       __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
+
+static struct attribute *bL_switcher_attrs[] = {
+       &bL_switcher_active_attr.attr,
+       &bL_switcher_trace_trigger_attr.attr,
+       NULL,
+};
+
+static struct attribute_group bL_switcher_attr_group = {
+       .attrs = bL_switcher_attrs,
+};
+
+static struct kobject *bL_switcher_kobj;
+
+static int __init bL_switcher_sysfs_init(void)
+{
+       int ret;
+
+       bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
+       if (!bL_switcher_kobj)
+               return -ENOMEM;
+       ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
+       if (ret)
+               kobject_put(bL_switcher_kobj);
+       return ret;
+}
+
+#endif  /* CONFIG_SYSFS */
+
+bool bL_switcher_get_enabled(void)
+{
+       mutex_lock(&bL_switcher_activation_lock);
+
+       return bL_switcher_active;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
+
+void bL_switcher_put_enabled(void)
+{
+       mutex_unlock(&bL_switcher_activation_lock);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
+
+/*
+ * Veto any CPU hotplug operation on those CPUs we've removed
+ * while the switcher is active.
+ * We're just not ready to deal with that given the trickery involved.
+ */
+static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
+                                       unsigned long action, void *hcpu)
+{
+       if (bL_switcher_active) {
+               int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
+               switch (action & 0xf) {
+               case CPU_UP_PREPARE:
+               case CPU_DOWN_PREPARE:
+                       if (pairing == -1)
+                               return NOTIFY_BAD;
+               }
+       }
+       return NOTIFY_DONE;
+}
+
+static bool no_bL_switcher;
+core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
+
+static int __init bL_switcher_init(void)
+{
+       int ret;
+
+       if (MAX_NR_CLUSTERS != 2) {
+               pr_err("%s: only dual cluster systems are supported\n", __func__);
+               return -EINVAL;
+       }
+
+       cpu_notifier(bL_switcher_hotplug_callback, 0);
+
+       if (!no_bL_switcher) {
+               ret = bL_switcher_enable();
+               if (ret)
+                       return ret;
+       }
+
+#ifdef CONFIG_SYSFS
+       ret = bL_switcher_sysfs_init();
+       if (ret)
+               pr_err("%s: unable to create sysfs entry\n", __func__);
+#endif
+
+       return 0;
+}
+
+late_initcall(bL_switcher_init);
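
To make the exported interface above concrete, here is a minimal, hypothetical in-kernel user (illustration only, not part of this commit) that asks for logical CPU 0 to be moved to cluster 1 and blocks until the completer fires.  It assumes <asm/bL_switcher.h> declares bL_switch_request_cb() and bL_switch_completion_handler as they are used in bL_switcher.c above.  From user space, the switcher itself is enabled or disabled by writing '1' or '0' to the "active" sysfs attribute registered by bL_switcher_sysfs_init() (under the kernel kobject, i.e. /sys/kernel/bL_switcher/active), and writing to "trace_trigger" re-emits the current-MPIDR trace events.

/*
 * Hypothetical caller, for illustration only: request a switch of
 * logical CPU 0 to cluster 1 and wait for its completion callback.
 */
#include <linux/completion.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/bL_switcher.h>

static DECLARE_COMPLETION(switch_done);

static void switch_demo_completer(void *cookie)
{
	/* Runs in non-atomic context once the switch has completed. */
	complete(cookie);
}

static int __init switch_demo_init(void)
{
	int ret = bL_switch_request_cb(0, 1, switch_demo_completer,
				       &switch_done);

	if (ret)	/* -EINVAL, -ESRCH or -EBUSY, as returned above */
		return ret;
	wait_for_completion(&switch_done);
	return 0;
}
module_init(switch_demo_init);
MODULE_LICENSE("GPL");

Note that bL_switch_request_cb() refuses a second request for the same CPU until the previous completer has run (-EBUSY), which is why the demo waits before returning.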
diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c
new file mode 100644 (file)
index 0000000..3f47f12
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
+ *
+ * Created by: Nicolas Pitre, November 2012
+ * Copyright:  (C) 2012-2013  Linaro Limited
+ *
+ * Dummy interface to user space for debugging purpose only.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <asm/bL_switcher.h>
+
+static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
+                       size_t len, loff_t *pos)
+{
+       unsigned char val[3];
+       unsigned int cpu, cluster;
+       int ret;
+
+       pr_debug("%s\n", __func__);
+
+       if (len < 3)
+               return -EINVAL;
+
+       if (copy_from_user(val, buf, 3))
+               return -EFAULT;
+
+       /* format: <cpu#>,<cluster#> */
+       if (val[0] < '0' || val[0] > '9' ||
+           val[1] != ',' ||
+           val[2] < '0' || val[2] > '1')
+               return -EINVAL;
+
+       cpu = val[0] - '0';
+       cluster = val[2] - '0';
+       ret = bL_switch_request(cpu, cluster);
+
+       return ret ? : len;
+}
+
+static const struct file_operations bL_switcher_fops = {
+       .write          = bL_switcher_write,
+       .owner  = THIS_MODULE,
+};
+
+static struct miscdevice bL_switcher_device = {
+       MISC_DYNAMIC_MINOR,
+       "b.L_switcher",
+       &bL_switcher_fops
+};
+
+static int __init bL_switcher_dummy_if_init(void)
+{
+       return misc_register(&bL_switcher_device);
+}
+
+static void __exit bL_switcher_dummy_if_exit(void)
+{
+       misc_deregister(&bL_switcher_device);
+}
+
+module_init(bL_switcher_dummy_if_init);
+module_exit(bL_switcher_dummy_if_exit);
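
The dummy interface parses a three-character "<cpu#>,<cluster#>" string, so exercising it from user space is trivial.  A sketch follows (illustration only; the /dev path is an assumption, since the driver registers a dynamic misc minor named "b.L_switcher" and the actual node name depends on udev):

/* Illustrative user-space counterpart, not part of this commit. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/b.L_switcher", O_WRONLY);	/* assumed path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "0,1" requests that logical CPU 0 run on cluster 1. */
	if (write(fd, "0,1", 3) != 3)
		perror("write");
	close(fd);
	return 0;
}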
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 990250965f2cfb4e4e3a984678fcf62eedbcdb8d..26020a03f659f2d78fe37f2dca7f3e0a0eb573ca 100644 (file)
@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
        sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
 
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+                        unsigned long poke_phys_addr, unsigned long poke_val)
+{
+       unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+       poke[0] = poke_phys_addr;
+       poke[1] = poke_val;
+       __cpuc_flush_dcache_area((void *)poke, 8);
+       outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
 static const struct mcpm_platform_ops *platform_ops;
 
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
@@ -90,6 +102,21 @@ void mcpm_cpu_power_down(void)
        BUG();
 }
 
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster)
+{
+       int ret;
+
+       if (WARN_ON_ONCE(!platform_ops || !platform_ops->power_down_finish))
+               return -EUNATCH;
+
+       ret = platform_ops->power_down_finish(cpu, cluster);
+       if (ret)
+               pr_warn("%s: cpu %u, cluster %u failed to power down (%d)\n",
+                       __func__, cpu, cluster, ret);
+
+       return ret;
+}
+
 void mcpm_cpu_suspend(u64 expected_residency)
 {
        phys_reset_t phys_reset;
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 39c96df3477a41549d71e1a18733dd85f4a168df..e02db4b81a66942d307cced79e0cd6c6ae9733b5 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
+#include <asm/assembler.h>
 
 #include "vlock.h"
 
@@ -47,6 +48,7 @@
 
 ENTRY(mcpm_entry_point)
 
+ ARM_BE8(setend        be)
  THUMB(        adr     r12, BSYM(1f)   )
  THUMB(        bx      r12             )
  THUMB(        .thumb                  )
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
         * position independent way.
         */
        adr     r5, 3f
-       ldmia   r5, {r6, r7, r8, r11}
+       ldmia   r5, {r0, r6, r7, r8, r11}
+       add     r0, r5, r0                      @ r0 = mcpm_entry_early_pokes
        add     r6, r5, r6                      @ r6 = mcpm_entry_vectors
        ldr     r7, [r5, r7]                    @ r7 = mcpm_power_up_setup_phys
        add     r8, r5, r8                      @ r8 = mcpm_sync
        add     r11, r5, r11                    @ r11 = first_man_locks
 
+       @ Perform an early poke, if any
+       add     r0, r0, r4, lsl #3
+       ldmia   r0, {r0, r1}
+       teq     r0, #0
+       strne   r1, [r0]
+
        mov     r0, #MCPM_SYNC_CLUSTER_SIZE
        mla     r8, r0, r10, r8                 @ r8 = sync cluster base
 
@@ -195,7 +204,8 @@ mcpm_entry_gated:
 
        .align  2
 
-3:     .word   mcpm_entry_vectors - .
+3:     .word   mcpm_entry_early_pokes - .
+       .word   mcpm_entry_vectors - 3b
        .word   mcpm_power_up_setup_phys - 3b
        .word   mcpm_sync - 3b
        .word   first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
 ENTRY(mcpm_entry_vectors)
        .space  4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
 
+       .type   mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+       .space  8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
        .type   mcpm_power_up_setup_phys, #object
 ENTRY(mcpm_power_up_setup_phys)
        .space  4               @ set by mcpm_sync_init()
diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c
index 1bc34c7567fdf890a5eea685c04d70f7de89ffbf..177251a4dd9aff7a8ba616d1ed04796e5c49cb88 100644 (file)
 #include <asm/smp.h>
 #include <asm/smp_plat.h>
 
+static void cpu_to_pcpu(unsigned int cpu,
+                       unsigned int *pcpu, unsigned int *pcluster)
+{
+       unsigned int mpidr;
+
+       mpidr = cpu_logical_map(cpu);
+       *pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+       *pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+}
+
 static int mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-       unsigned int mpidr, pcpu, pcluster, ret;
+       unsigned int pcpu, pcluster, ret;
        extern void secondary_startup(void);
 
-       mpidr = cpu_logical_map(cpu);
-       pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-       pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+       cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
        pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n",
                 __func__, cpu, pcpu, pcluster);
 
@@ -47,6 +56,15 @@ static void mcpm_secondary_init(unsigned int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+static int mcpm_cpu_kill(unsigned int cpu)
+{
+       unsigned int pcpu, pcluster;
+
+       cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
+       return !mcpm_cpu_power_down_finish(pcpu, pcluster);
+}
+
 static int mcpm_cpu_disable(unsigned int cpu)
 {
        /*
@@ -73,6 +91,7 @@ static struct smp_operations __initdata mcpm_smp_ops = {
        .smp_boot_secondary     = mcpm_boot_secondary,
        .smp_secondary_init     = mcpm_secondary_init,
 #ifdef CONFIG_HOTPLUG_CPU
+       .cpu_kill               = mcpm_cpu_kill,
        .cpu_disable            = mcpm_cpu_disable,
        .cpu_die                = mcpm_cpu_die,
 #endif
diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c
index e901d0f3e0bbcd735f5cf7e62bd653536ff8aa71..ce922d0ea7aa85daa59c408ac5cd79beab5459a6 100644 (file)
@@ -175,7 +175,7 @@ static struct clock_event_device sp804_clockevent = {
 
 static struct irqaction sp804_timer_irq = {
        .name           = "timer",
-       .flags          = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
+       .flags          = IRQF_TIMER | IRQF_IRQPOLL,
        .handler        = sp804_timer_interrupt,
        .dev_id         = &sp804_clockevent,
 };
diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig
index 317960f1248893d6200650ec9c01e8132b627c16..0142ec37e0be26d8c1480aa4929a722896b3b397 100644 (file)
@@ -1,5 +1,6 @@
-CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_MODULES=y
@@ -11,11 +12,11 @@ CONFIG_ARCH_SA1100=y
 CONFIG_SA1100_H3600=y
 CONFIG_PCCARD=y
 CONFIG_PCMCIA_SA1100=y
+CONFIG_PREEMPT=y
 CONFIG_ZBOOT_ROM_TEXT=0x0
 CONFIG_ZBOOT_ROM_BSS=0x0
 # CONFIG_CPU_FREQ_STAT is not set
 CONFIG_FPE_NWFPE=y
-CONFIG_PM=y
 CONFIG_NET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
@@ -24,13 +25,10 @@ CONFIG_IRDA=m
 CONFIG_IRLAN=m
 CONFIG_IRNET=m
 CONFIG_IRCOMM=m
-CONFIG_SA1100_FIR=m
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
 CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -41,19 +39,15 @@ CONFIG_MTD_SA1100=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
-# CONFIG_MISC_DEVICES is not set
 CONFIG_IDE=y
 CONFIG_BLK_DEV_IDECS=y
 CONFIG_NETDEVICES=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
-# CONFIG_WLAN is not set
-CONFIG_NET_PCMCIA=y
 CONFIG_PCMCIA_PCNET=y
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_ASYNC=m
+# CONFIG_WLAN is not set
 # CONFIG_KEYBOARD_ATKBD is not set
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
@@ -64,8 +58,6 @@ CONFIG_SERIAL_SA1100_CONSOLE=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
 CONFIG_FB_SA1100=y
-# CONFIG_VGA_CONSOLE is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EXT2_FS=y
 CONFIG_MSDOS_FS=m
@@ -74,6 +66,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=m
 CONFIG_NFS_FS=y
 CONFIG_NFSD=m
-CONFIG_SMB_FS=m
 CONFIG_NLS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644 (file)
index 0000000..6231d36
--- /dev/null
@@ -0,0 +1 @@
+aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90a29f5f1d06415cb4a0db4dd726e1..81cda39860c5c7ad90a6710727011ec79296e5d8 100644 (file)
@@ -3,7 +3,17 @@
 #
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 
-aes-arm-y  := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-y      := aes-armv4.o aes_glue.o
+aes-arm-bs-y   := aesbs-core.o aesbs-glue.o
+sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+       $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index 59f7877ead6ac9ee3f8a31b43c6e0458de26cd8f..3003fa1f6fb4b9395c77340fbf83011b5cb0e419 100644 (file)
@@ -6,22 +6,12 @@
 #include <linux/crypto.h>
 #include <crypto/aes.h>
 
-#define AES_MAXNR 14
+#include "aes_glue.h"
 
-typedef struct {
-       unsigned int rd_key[4 *(AES_MAXNR + 1)];
-       int rounds;
-} AES_KEY;
-
-struct AES_CTX {
-       AES_KEY enc_key;
-       AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
                .cipher = {
                        .cia_min_keysize        = AES_MIN_KEY_SIZE,
                        .cia_max_keysize        = AES_MAX_KEY_SIZE,
-                       .cia_setkey                     = aes_set_key,
+                       .cia_setkey             = aes_set_key,
                        .cia_encrypt            = aes_encrypt,
                        .cia_decrypt            = aes_decrypt
                }
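
For reference, here is a minimal sketch of how an in-kernel user might exercise the cipher this glue registers, going through the generic crypto API rather than calling the assembler entry points directly; example_aes_one_block() is a hypothetical helper and not part of the patch.

#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/aes.h>

/* Hypothetical consumer: encrypt a single 16-byte block via the crypto API. */
static int example_aes_one_block(const u8 key[AES_KEYSIZE_128],
                                 const u8 in[AES_BLOCK_SIZE],
                                 u8 out[AES_BLOCK_SIZE])
{
        struct crypto_cipher *tfm;
        int err;

        tfm = crypto_alloc_cipher("aes", 0, 0); /* highest-priority AES provider */
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_cipher_setkey(tfm, key, AES_KEYSIZE_128);
        if (!err)
                crypto_cipher_encrypt_one(tfm, out, in);

        crypto_free_cipher(tfm);
        return err;
}
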
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644 (file)
index 0000000..cca3e51
--- /dev/null
@@ -0,0 +1,19 @@
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+       unsigned int rd_key[4 * (AES_MAXNR + 1)];
+       int rounds;
+};
+
+struct AES_CTX {
+       struct AES_KEY enc_key;
+       struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+                                          const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+                                          const int bits, struct AES_KEY *key);
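
A minimal sketch of how glue code consumes the entry points declared in this header, assuming crypto_tfm_ctx() yields a struct AES_CTX as in the existing aes_glue.c; bsaes_setkey_sketch() is an illustrative name, not a function added by the patch.

#include <linux/errno.h>
#include <linux/crypto.h>
#include <crypto/aes.h>
#include "aes_glue.h"

/* Sketch: run both asm key-schedule helpers (they take the key size in bits). */
static int bsaes_setkey_sketch(struct crypto_tfm *tfm, const u8 *in_key,
                               unsigned int key_len)
{
        struct AES_CTX *ctx = crypto_tfm_ctx(tfm);

        if (private_AES_set_encrypt_key(in_key, key_len * 8, &ctx->enc_key) ||
            private_AES_set_decrypt_key(in_key, key_len * 8, &ctx->dec_key)) {
                crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
                return -EINVAL;
        }
        return 0;
}
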
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
new file mode 100644 (file)
index 0000000..64205d4
--- /dev/null
@@ -0,0 +1,2544 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt      19.5 cycles per byte processed with 128-bit key
+@ decrypt      22.1 cycles per byte processed with 128-bit key
+@ key conv.    440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@                                              <appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@                                      <ard.biesheuvel@linaro.org>
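
To make the comment above concrete: bit-slicing transposes eight AES blocks so that each NEON q register holds one bit position of all eight blocks, and a single logical instruction then operates on that bit of every block at once. A purely illustrative userspace toy of the transposition, shrunk to one byte per block (bitslice8() is not part of the patch):

#include <stdint.h>

/* Toy transpose: out[bit] collects bit 'bit' of each of the eight input bytes. */
static void bitslice8(const uint8_t in[8], uint8_t out[8])
{
        for (int bit = 0; bit < 8; bit++) {
                uint8_t lane = 0;
                for (int blk = 0; blk < 8; blk++)
                        lane |= (uint8_t)(((in[blk] >> bit) & 1u) << blk);
                out[bit] = lane;
        }
}
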
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
+# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax        unified         @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu   neon
+
+.type  _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+       adr     r6,_bsaes_decrypt8
+       vldmia  r4!, {q9}               @ round 0 key
+       add     r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+       vldmia  r6!, {q8}               @ .LM0ISR
+       veor    q10, q0, q9     @ xor with round0 key
+       veor    q11, q1, q9
+        vtbl.8 d0, {q10}, d16
+        vtbl.8 d1, {q10}, d17
+       veor    q12, q2, q9
+        vtbl.8 d2, {q11}, d16
+        vtbl.8 d3, {q11}, d17
+       veor    q13, q3, q9
+        vtbl.8 d4, {q12}, d16
+        vtbl.8 d5, {q12}, d17
+       veor    q14, q4, q9
+        vtbl.8 d6, {q13}, d16
+        vtbl.8 d7, {q13}, d17
+       veor    q15, q5, q9
+        vtbl.8 d8, {q14}, d16
+        vtbl.8 d9, {q14}, d17
+       veor    q10, q6, q9
+        vtbl.8 d10, {q15}, d16
+        vtbl.8 d11, {q15}, d17
+       veor    q11, q7, q9
+        vtbl.8 d12, {q10}, d16
+        vtbl.8 d13, {q10}, d17
+        vtbl.8 d14, {q11}, d16
+        vtbl.8 d15, {q11}, d17
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q6, #1
+        vshr.u64       q11, q4, #1
+       veor            q10, q10, q7
+        veor           q11, q11, q5
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #1
+        veor           q5, q5, q11
+        vshl.u64       q11, q11, #1
+       veor            q6, q6, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q2, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q3
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q2, q2, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q5, #2
+        vshr.u64       q11, q4, #2
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #2
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #2
+       veor            q5, q5, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q3
+        veor           q11, q11, q2
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #2
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q3, #4
+        vshr.u64       q11, q2, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #4
+       veor            q3, q3, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q4
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q4, q4, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       sub     r5,r5,#1
+       b       .Ldec_sbox
+.align 4
+.Ldec_loop:
+       vldmia  r4!, {q8-q11}
+       veor    q8, q8, q0
+       veor    q9, q9, q1
+       vtbl.8  d0, {q8}, d24
+       vtbl.8  d1, {q8}, d25
+       vldmia  r4!, {q8}
+       veor    q10, q10, q2
+       vtbl.8  d2, {q9}, d24
+       vtbl.8  d3, {q9}, d25
+       vldmia  r4!, {q9}
+       veor    q11, q11, q3
+       vtbl.8  d4, {q10}, d24
+       vtbl.8  d5, {q10}, d25
+       vldmia  r4!, {q10}
+       vtbl.8  d6, {q11}, d24
+       vtbl.8  d7, {q11}, d25
+       vldmia  r4!, {q11}
+       veor    q8, q8, q4
+       veor    q9, q9, q5
+       vtbl.8  d8, {q8}, d24
+       vtbl.8  d9, {q8}, d25
+       veor    q10, q10, q6
+       vtbl.8  d10, {q9}, d24
+       vtbl.8  d11, {q9}, d25
+       veor    q11, q11, q7
+       vtbl.8  d12, {q10}, d24
+       vtbl.8  d13, {q10}, d25
+       vtbl.8  d14, {q11}, d24
+       vtbl.8  d15, {q11}, d25
+.Ldec_sbox:
+        veor   q1, q1, q4
+       veor    q3, q3, q4
+
+       veor    q4, q4, q7
+        veor   q1, q1, q6
+       veor    q2, q2, q7
+       veor    q6, q6, q4
+
+       veor    q0, q0, q1
+       veor    q2, q2, q5
+        veor   q7, q7, q6
+       veor    q3, q3, q0
+       veor    q5, q5, q0
+       veor    q1, q1, q3
+       veor    q11, q3, q0
+       veor    q10, q7, q4
+       veor    q9, q1, q6
+       veor    q13, q4, q0
+        vmov   q8, q10
+       veor    q12, q5, q2
+
+       vorr    q10, q10, q9
+       veor    q15, q11, q8
+       vand    q14, q11, q12
+       vorr    q11, q11, q12
+       veor    q12, q12, q9
+       vand    q8, q8, q9
+       veor    q9, q6, q2
+       vand    q15, q15, q12
+       vand    q13, q13, q9
+       veor    q9, q3, q7
+       veor    q12, q1, q5
+       veor    q11, q11, q13
+       veor    q10, q10, q13
+       vand    q13, q9, q12
+       vorr    q9, q9, q12
+       veor    q11, q11, q15
+       veor    q8, q8, q13
+       veor    q10, q10, q14
+       veor    q9, q9, q15
+       veor    q8, q8, q14
+       vand    q12, q4, q6
+       veor    q9, q9, q14
+       vand    q13, q0, q2
+       vand    q14, q7, q1
+       vorr    q15, q3, q5
+       veor    q11, q11, q12
+       veor    q9, q9, q14
+       veor    q8, q8, q15
+       veor    q10, q10, q13
+
+       @ Inv_GF16      0,      1,      2,      3, s0, s1, s2, s3
+
+       @ new smaller inversion
+
+       vand    q14, q11, q9
+       vmov    q12, q8
+
+       veor    q13, q10, q14
+       veor    q15, q8, q14
+       veor    q14, q8, q14    @ q14=q15
+
+       vbsl    q13, q9, q8
+       vbsl    q15, q11, q10
+       veor    q11, q11, q10
+
+       vbsl    q12, q13, q14
+       vbsl    q8, q14, q13
+
+       vand    q14, q12, q15
+       veor    q9, q9, q8
+
+       veor    q14, q14, q11
+       veor    q12, q5, q2
+       veor    q8, q1, q6
+       veor    q10, q15, q14
+       vand    q10, q10, q5
+       veor    q5, q5, q1
+       vand    q11, q1, q15
+       vand    q5, q5, q14
+       veor    q1, q11, q10
+       veor    q5, q5, q11
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q2
+       veor    q12, q12, q8
+        veor   q2, q2, q6
+       vand    q8, q8, q15
+        vand   q6, q6, q13
+       vand    q12, q12, q14
+        vand   q2, q2, q9
+       veor    q8, q8, q12
+        veor   q2, q2, q6
+       veor    q12, q12, q11
+        veor   q6, q6, q10
+       veor    q5, q5, q12
+       veor    q2, q2, q12
+       veor    q1, q1, q8
+       veor    q6, q6, q8
+
+       veor    q12, q3, q0
+       veor    q8, q7, q4
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q0
+       veor    q12, q12, q8
+        veor   q0, q0, q4
+       vand    q8, q8, q15
+        vand   q4, q4, q13
+       vand    q12, q12, q14
+        vand   q0, q0, q9
+       veor    q8, q8, q12
+        veor   q0, q0, q4
+       veor    q12, q12, q11
+        veor   q4, q4, q10
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q10, q15, q14
+       vand    q10, q10, q3
+       veor    q3, q3, q7
+       vand    q11, q7, q15
+       vand    q3, q3, q14
+       veor    q7, q11, q10
+       veor    q3, q3, q11
+       veor    q3, q3, q12
+       veor    q0, q0, q12
+       veor    q7, q7, q8
+       veor    q4, q4, q8
+       veor    q1, q1, q7
+       veor    q6, q6, q5
+
+       veor    q4, q4, q1
+       veor    q2, q2, q7
+       veor    q5, q5, q7
+       veor    q4, q4, q2
+        veor   q7, q7, q0
+       veor    q4, q4, q5
+        veor   q3, q3, q6
+        veor   q6, q6, q1
+       veor    q3, q3, q4
+
+       veor    q4, q4, q0
+       veor    q7, q7, q3
+       subs    r5,r5,#1
+       bcc     .Ldec_done
+       @ multiplication by 0x05-0x00-0x04-0x00
+       vext.8  q8, q0, q0, #8
+       vext.8  q14, q3, q3, #8
+       vext.8  q15, q5, q5, #8
+       veor    q8, q8, q0
+       vext.8  q9, q1, q1, #8
+       veor    q14, q14, q3
+       vext.8  q10, q6, q6, #8
+       veor    q15, q15, q5
+       vext.8  q11, q4, q4, #8
+       veor    q9, q9, q1
+       vext.8  q12, q2, q2, #8
+       veor    q10, q10, q6
+       vext.8  q13, q7, q7, #8
+       veor    q11, q11, q4
+       veor    q12, q12, q2
+       veor    q13, q13, q7
+
+        veor   q0, q0, q14
+        veor   q1, q1, q14
+        veor   q6, q6, q8
+        veor   q2, q2, q10
+        veor   q4, q4, q9
+        veor   q1, q1, q15
+        veor   q6, q6, q15
+        veor   q2, q2, q14
+        veor   q7, q7, q11
+        veor   q4, q4, q14
+        veor   q3, q3, q12
+        veor   q2, q2, q15
+        veor   q7, q7, q15
+        veor   q5, q5, q13
+       vext.8  q8, q0, q0, #12 @ x0 <<< 32
+       vext.8  q9, q1, q1, #12
+        veor   q0, q0, q8              @ x0 ^ (x0 <<< 32)
+       vext.8  q10, q6, q6, #12
+        veor   q1, q1, q9
+       vext.8  q11, q4, q4, #12
+        veor   q6, q6, q10
+       vext.8  q12, q2, q2, #12
+        veor   q4, q4, q11
+       vext.8  q13, q7, q7, #12
+        veor   q2, q2, q12
+       vext.8  q14, q3, q3, #12
+        veor   q7, q7, q13
+       vext.8  q15, q5, q5, #12
+        veor   q3, q3, q14
+
+       veor    q9, q9, q0
+        veor   q5, q5, q15
+        vext.8 q0, q0, q0, #8          @ (x0 ^ (x0 <<< 32)) <<< 64)
+       veor    q10, q10, q1
+       veor    q8, q8, q5
+       veor    q9, q9, q5
+        vext.8 q1, q1, q1, #8
+       veor    q13, q13, q2
+        veor   q0, q0, q8
+       veor    q14, q14, q7
+        veor   q1, q1, q9
+        vext.8 q8, q2, q2, #8
+       veor    q12, q12, q4
+        vext.8 q9, q7, q7, #8
+       veor    q15, q15, q3
+        vext.8 q2, q4, q4, #8
+       veor    q11, q11, q6
+        vext.8 q7, q5, q5, #8
+       veor    q12, q12, q5
+        vext.8 q4, q3, q3, #8
+       veor    q11, q11, q5
+        vext.8 q3, q6, q6, #8
+       veor    q5, q9, q13
+       veor    q11, q11, q2
+       veor    q7, q7, q15
+       veor    q6, q4, q14
+       veor    q4, q8, q12
+       veor    q2, q3, q10
+       vmov    q3, q11
+        @ vmov q5, q9
+       vldmia  r6, {q12}               @ .LISR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   r6,r6,#0x10
+       bne     .Ldec_loop
+       vldmia  r6, {q12}               @ .LISRM0
+       b       .Ldec_loop
+.align 4
+.Ldec_done:
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q3, #1
+        vshr.u64       q11, q2, #1
+       veor            q10, q10, q5
+        veor           q11, q11, q7
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #1
+        veor           q7, q7, q11
+        vshl.u64       q11, q11, #1
+       veor            q3, q3, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q6, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q4
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q4, q4, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q6, q6, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q7, #2
+        vshr.u64       q11, q2, #2
+       veor            q10, q10, q5
+        veor           q11, q11, q3
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #2
+        veor           q3, q3, q11
+        vshl.u64       q11, q11, #2
+       veor            q7, q7, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q4
+        veor           q11, q11, q6
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q4, q4, q10
+       vshl.u64        q10, q10, #2
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q4, #4
+        vshr.u64       q11, q6, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q3
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q3, q3, q11
+        vshl.u64       q11, q11, #4
+       veor            q4, q4, q10
+        veor           q6, q6, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q2
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vldmia  r4, {q8}                        @ last round key
+       veor    q6, q6, q8
+       veor    q4, q4, q8
+       veor    q2, q2, q8
+       veor    q7, q7, q8
+       veor    q3, q3, q8
+       veor    q5, q5, q8
+       veor    q0, q0, q8
+       veor    q1, q1, q8
+       bx      lr
+.size  _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type  _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR:       @ InvShiftRows constants
+       .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+       .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+       .quad   0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:                @ ShiftRows constants
+       .quad   0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+       .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+       .quad   0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+       .quad   0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+       .quad   0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 6
+.size  _bsaes_const,.-_bsaes_const
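
The quadwords in _bsaes_const are byte-index vectors: vtbl.8 treats the (key-XORed) state registers as a 16-byte lookup table and these constants as the indices, which is how ShiftRows, InvShiftRows and the .LM0 interleaving reorder bytes without branches. A plain-C analogue of that lookup, assuming all indices are in range (vtbl.8 would return zero for out-of-range ones); tbl16() is an illustrative name:

#include <stdint.h>

/* C analogue of "vtbl.8 dN, {table}, index": dst[i] = table[idx[i]]. */
static void tbl16(const uint8_t table[16], const uint8_t idx[16], uint8_t dst[16])
{
        for (int i = 0; i < 16; i++)
                dst[i] = table[idx[i]];
}
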
+
+.type  _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+       adr     r6,_bsaes_encrypt8
+       vldmia  r4!, {q9}               @ round 0 key
+       sub     r6,r6,#_bsaes_encrypt8-.LM0SR
+
+       vldmia  r6!, {q8}               @ .LM0SR
+_bsaes_encrypt8_alt:
+       veor    q10, q0, q9     @ xor with round0 key
+       veor    q11, q1, q9
+        vtbl.8 d0, {q10}, d16
+        vtbl.8 d1, {q10}, d17
+       veor    q12, q2, q9
+        vtbl.8 d2, {q11}, d16
+        vtbl.8 d3, {q11}, d17
+       veor    q13, q3, q9
+        vtbl.8 d4, {q12}, d16
+        vtbl.8 d5, {q12}, d17
+       veor    q14, q4, q9
+        vtbl.8 d6, {q13}, d16
+        vtbl.8 d7, {q13}, d17
+       veor    q15, q5, q9
+        vtbl.8 d8, {q14}, d16
+        vtbl.8 d9, {q14}, d17
+       veor    q10, q6, q9
+        vtbl.8 d10, {q15}, d16
+        vtbl.8 d11, {q15}, d17
+       veor    q11, q7, q9
+        vtbl.8 d12, {q10}, d16
+        vtbl.8 d13, {q10}, d17
+        vtbl.8 d14, {q11}, d16
+        vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q6, #1
+        vshr.u64       q11, q4, #1
+       veor            q10, q10, q7
+        veor           q11, q11, q5
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #1
+        veor           q5, q5, q11
+        vshl.u64       q11, q11, #1
+       veor            q6, q6, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q2, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q3
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q2, q2, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q5, #2
+        vshr.u64       q11, q4, #2
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #2
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #2
+       veor            q5, q5, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q3
+        veor           q11, q11, q2
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q3, q3, q10
+       vshl.u64        q10, q10, #2
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q3, #4
+        vshr.u64       q11, q2, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q6
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q6, q6, q11
+        vshl.u64       q11, q11, #4
+       veor            q3, q3, q10
+        veor           q2, q2, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q4
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q4, q4, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       sub     r5,r5,#1
+       b       .Lenc_sbox
+.align 4
+.Lenc_loop:
+       vldmia  r4!, {q8-q11}
+       veor    q8, q8, q0
+       veor    q9, q9, q1
+       vtbl.8  d0, {q8}, d24
+       vtbl.8  d1, {q8}, d25
+       vldmia  r4!, {q8}
+       veor    q10, q10, q2
+       vtbl.8  d2, {q9}, d24
+       vtbl.8  d3, {q9}, d25
+       vldmia  r4!, {q9}
+       veor    q11, q11, q3
+       vtbl.8  d4, {q10}, d24
+       vtbl.8  d5, {q10}, d25
+       vldmia  r4!, {q10}
+       vtbl.8  d6, {q11}, d24
+       vtbl.8  d7, {q11}, d25
+       vldmia  r4!, {q11}
+       veor    q8, q8, q4
+       veor    q9, q9, q5
+       vtbl.8  d8, {q8}, d24
+       vtbl.8  d9, {q8}, d25
+       veor    q10, q10, q6
+       vtbl.8  d10, {q9}, d24
+       vtbl.8  d11, {q9}, d25
+       veor    q11, q11, q7
+       vtbl.8  d12, {q10}, d24
+       vtbl.8  d13, {q10}, d25
+       vtbl.8  d14, {q11}, d24
+       vtbl.8  d15, {q11}, d25
+.Lenc_sbox:
+       veor    q2, q2, q1
+       veor    q5, q5, q6
+       veor    q3, q3, q0
+       veor    q6, q6, q2
+       veor    q5, q5, q0
+
+       veor    q6, q6, q3
+       veor    q3, q3, q7
+       veor    q7, q7, q5
+       veor    q3, q3, q4
+       veor    q4, q4, q5
+
+       veor    q2, q2, q7
+       veor    q3, q3, q1
+       veor    q1, q1, q5
+       veor    q11, q7, q4
+       veor    q10, q1, q2
+       veor    q9, q5, q3
+       veor    q13, q2, q4
+        vmov   q8, q10
+       veor    q12, q6, q0
+
+       vorr    q10, q10, q9
+       veor    q15, q11, q8
+       vand    q14, q11, q12
+       vorr    q11, q11, q12
+       veor    q12, q12, q9
+       vand    q8, q8, q9
+       veor    q9, q3, q0
+       vand    q15, q15, q12
+       vand    q13, q13, q9
+       veor    q9, q7, q1
+       veor    q12, q5, q6
+       veor    q11, q11, q13
+       veor    q10, q10, q13
+       vand    q13, q9, q12
+       vorr    q9, q9, q12
+       veor    q11, q11, q15
+       veor    q8, q8, q13
+       veor    q10, q10, q14
+       veor    q9, q9, q15
+       veor    q8, q8, q14
+       vand    q12, q2, q3
+       veor    q9, q9, q14
+       vand    q13, q4, q0
+       vand    q14, q1, q5
+       vorr    q15, q7, q6
+       veor    q11, q11, q12
+       veor    q9, q9, q14
+       veor    q8, q8, q15
+       veor    q10, q10, q13
+
+       @ Inv_GF16      0,      1,      2,      3, s0, s1, s2, s3
+
+       @ new smaller inversion
+
+       vand    q14, q11, q9
+       vmov    q12, q8
+
+       veor    q13, q10, q14
+       veor    q15, q8, q14
+       veor    q14, q8, q14    @ q14=q15
+
+       vbsl    q13, q9, q8
+       vbsl    q15, q11, q10
+       veor    q11, q11, q10
+
+       vbsl    q12, q13, q14
+       vbsl    q8, q14, q13
+
+       vand    q14, q12, q15
+       veor    q9, q9, q8
+
+       veor    q14, q14, q11
+       veor    q12, q6, q0
+       veor    q8, q5, q3
+       veor    q10, q15, q14
+       vand    q10, q10, q6
+       veor    q6, q6, q5
+       vand    q11, q5, q15
+       vand    q6, q6, q14
+       veor    q5, q11, q10
+       veor    q6, q6, q11
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q0
+       veor    q12, q12, q8
+        veor   q0, q0, q3
+       vand    q8, q8, q15
+        vand   q3, q3, q13
+       vand    q12, q12, q14
+        vand   q0, q0, q9
+       veor    q8, q8, q12
+        veor   q0, q0, q3
+       veor    q12, q12, q11
+        veor   q3, q3, q10
+       veor    q6, q6, q12
+       veor    q0, q0, q12
+       veor    q5, q5, q8
+       veor    q3, q3, q8
+
+       veor    q12, q7, q4
+       veor    q8, q1, q2
+       veor    q11, q15, q14
+        veor   q10, q13, q9
+       vand    q11, q11, q12
+        vand   q10, q10, q4
+       veor    q12, q12, q8
+        veor   q4, q4, q2
+       vand    q8, q8, q15
+        vand   q2, q2, q13
+       vand    q12, q12, q14
+        vand   q4, q4, q9
+       veor    q8, q8, q12
+        veor   q4, q4, q2
+       veor    q12, q12, q11
+        veor   q2, q2, q10
+       veor    q15, q15, q13
+       veor    q14, q14, q9
+       veor    q10, q15, q14
+       vand    q10, q10, q7
+       veor    q7, q7, q1
+       vand    q11, q1, q15
+       vand    q7, q7, q14
+       veor    q1, q11, q10
+       veor    q7, q7, q11
+       veor    q7, q7, q12
+       veor    q4, q4, q12
+       veor    q1, q1, q8
+       veor    q2, q2, q8
+       veor    q7, q7, q0
+       veor    q1, q1, q6
+       veor    q6, q6, q0
+       veor    q4, q4, q7
+       veor    q0, q0, q1
+
+       veor    q1, q1, q5
+       veor    q5, q5, q2
+       veor    q2, q2, q3
+       veor    q3, q3, q5
+       veor    q4, q4, q5
+
+       veor    q6, q6, q3
+       subs    r5,r5,#1
+       bcc     .Lenc_done
+       vext.8  q8, q0, q0, #12 @ x0 <<< 32
+       vext.8  q9, q1, q1, #12
+        veor   q0, q0, q8              @ x0 ^ (x0 <<< 32)
+       vext.8  q10, q4, q4, #12
+        veor   q1, q1, q9
+       vext.8  q11, q6, q6, #12
+        veor   q4, q4, q10
+       vext.8  q12, q3, q3, #12
+        veor   q6, q6, q11
+       vext.8  q13, q7, q7, #12
+        veor   q3, q3, q12
+       vext.8  q14, q2, q2, #12
+        veor   q7, q7, q13
+       vext.8  q15, q5, q5, #12
+        veor   q2, q2, q14
+
+       veor    q9, q9, q0
+        veor   q5, q5, q15
+        vext.8 q0, q0, q0, #8          @ (x0 ^ (x0 <<< 32)) <<< 64)
+       veor    q10, q10, q1
+       veor    q8, q8, q5
+       veor    q9, q9, q5
+        vext.8 q1, q1, q1, #8
+       veor    q13, q13, q3
+        veor   q0, q0, q8
+       veor    q14, q14, q7
+        veor   q1, q1, q9
+        vext.8 q8, q3, q3, #8
+       veor    q12, q12, q6
+        vext.8 q9, q7, q7, #8
+       veor    q15, q15, q2
+        vext.8 q3, q6, q6, #8
+       veor    q11, q11, q4
+        vext.8 q7, q5, q5, #8
+       veor    q12, q12, q5
+        vext.8 q6, q2, q2, #8
+       veor    q11, q11, q5
+        vext.8 q2, q4, q4, #8
+       veor    q5, q9, q13
+       veor    q4, q8, q12
+       veor    q3, q3, q11
+       veor    q7, q7, q15
+       veor    q6, q6, q14
+        @ vmov q4, q8
+       veor    q2, q2, q10
+        @ vmov q5, q9
+       vldmia  r6, {q12}               @ .LSR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   r6,r6,#0x10
+       bne     .Lenc_loop
+       vldmia  r6, {q12}               @ .LSRM0
+       b       .Lenc_loop
+.align 4
+.Lenc_done:
+       vmov.i8 q8,#0x55                        @ compose .LBS0
+       vmov.i8 q9,#0x33                        @ compose .LBS1
+       vshr.u64        q10, q2, #1
+        vshr.u64       q11, q3, #1
+       veor            q10, q10, q5
+        veor           q11, q11, q7
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #1
+        veor           q7, q7, q11
+        vshl.u64       q11, q11, #1
+       veor            q2, q2, q10
+        veor           q3, q3, q11
+       vshr.u64        q10, q4, #1
+        vshr.u64       q11, q0, #1
+       veor            q10, q10, q6
+        veor           q11, q11, q1
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q6, q6, q10
+       vshl.u64        q10, q10, #1
+        veor           q1, q1, q11
+        vshl.u64       q11, q11, #1
+       veor            q4, q4, q10
+        veor           q0, q0, q11
+       vmov.i8 q8,#0x0f                        @ compose .LBS2
+       vshr.u64        q10, q7, #2
+        vshr.u64       q11, q3, #2
+       veor            q10, q10, q5
+        veor           q11, q11, q2
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #2
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #2
+       veor            q7, q7, q10
+        veor           q3, q3, q11
+       vshr.u64        q10, q1, #2
+        vshr.u64       q11, q0, #2
+       veor            q10, q10, q6
+        veor           q11, q11, q4
+       vand            q10, q10, q9
+        vand           q11, q11, q9
+       veor            q6, q6, q10
+       vshl.u64        q10, q10, #2
+        veor           q4, q4, q11
+        vshl.u64       q11, q11, #2
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vshr.u64        q10, q6, #4
+        vshr.u64       q11, q4, #4
+       veor            q10, q10, q5
+        veor           q11, q11, q2
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q5, q5, q10
+       vshl.u64        q10, q10, #4
+        veor           q2, q2, q11
+        vshl.u64       q11, q11, #4
+       veor            q6, q6, q10
+        veor           q4, q4, q11
+       vshr.u64        q10, q1, #4
+        vshr.u64       q11, q0, #4
+       veor            q10, q10, q7
+        veor           q11, q11, q3
+       vand            q10, q10, q8
+        vand           q11, q11, q8
+       veor            q7, q7, q10
+       vshl.u64        q10, q10, #4
+        veor           q3, q3, q11
+        vshl.u64       q11, q11, #4
+       veor            q1, q1, q10
+        veor           q0, q0, q11
+       vldmia  r4, {q8}                        @ last round key
+       veor    q4, q4, q8
+       veor    q6, q6, q8
+       veor    q3, q3, q8
+       veor    q7, q7, q8
+       veor    q2, q2, q8
+       veor    q5, q5, q8
+       veor    q0, q0, q8
+       veor    q1, q1, q8
+       bx      lr
+.size  _bsaes_encrypt8,.-_bsaes_encrypt8
+.type  _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+       adr     r6,_bsaes_key_convert
+       vld1.8  {q7},  [r4]!            @ load round 0 key
+       sub     r6,r6,#_bsaes_key_convert-.LM0
+       vld1.8  {q15}, [r4]!            @ load round 1 key
+
+       vmov.i8 q8,  #0x01                      @ bit masks
+       vmov.i8 q9,  #0x02
+       vmov.i8 q10, #0x04
+       vmov.i8 q11, #0x08
+       vmov.i8 q12, #0x10
+       vmov.i8 q13, #0x20
+       vldmia  r6, {q14}               @ .LM0
+
+#ifdef __ARMEL__
+       vrev32.8        q7,  q7
+       vrev32.8        q15, q15
+#endif
+       sub     r5,r5,#1
+       vstmia  r12!, {q7}              @ save round 0 key
+       b       .Lkey_loop
+
+.align 4
+.Lkey_loop:
+       vtbl.8  d14,{q15},d28
+       vtbl.8  d15,{q15},d29
+       vmov.i8 q6,  #0x40
+       vmov.i8 q15, #0x80
+
+       vtst.8  q0, q7, q8
+       vtst.8  q1, q7, q9
+       vtst.8  q2, q7, q10
+       vtst.8  q3, q7, q11
+       vtst.8  q4, q7, q12
+       vtst.8  q5, q7, q13
+       vtst.8  q6, q7, q6
+       vtst.8  q7, q7, q15
+       vld1.8  {q15}, [r4]!            @ load next round key
+       vmvn    q0, q0          @ "pnot"
+       vmvn    q1, q1
+       vmvn    q5, q5
+       vmvn    q6, q6
+#ifdef __ARMEL__
+       vrev32.8        q15, q15
+#endif
+       subs    r5,r5,#1
+       vstmia  r12!,{q0-q7}            @ write bit-sliced round key
+       bne     .Lkey_loop
+
+       vmov.i8 q7,#0x63                        @ compose .L63
+       @ don't save last round key
+       bx      lr
+.size  _bsaes_key_convert,.-_bsaes_key_convert
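
_bsaes_key_convert stores the schedule in bit-sliced form: vtst.8 against the 0x01..0x80 masks turns each key bit into a full 0x00/0xff byte lane, and a few lanes are then inverted ("pnot"), with the callers fixing up the round 0 or last round key afterwards. A toy C rendition of the mask-expansion step, reduced to a single byte (expand_key_byte() is illustrative only):

#include <stdint.h>

/* Expand one round-key byte into eight 0x00/0xff mask bytes (cf. vtst.8). */
static void expand_key_byte(uint8_t key_byte, uint8_t masks[8])
{
        for (int i = 0; i < 8; i++)
                masks[i] = (key_byte & (1u << i)) ? 0xff : 0x00;
}
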
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global        bsaes_cbc_encrypt
+.type  bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef        __KERNEL__
+       cmp     r2, #128
+#ifndef        __thumb__
+       blo     AES_cbc_encrypt
+#else
+       bhs     1f
+       b       AES_cbc_encrypt
+1:
+#endif
+#endif
+
+       @ it is up to the caller to make sure we are called with enc == 0
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     r8, [ip]                        @ IV is 1st arg on the stack
+       mov     r2, r2, lsr#4           @ len in 16 byte blocks
+       sub     sp, #0x10                       @ scratch space to carry over the IV
+       mov     r9, sp                          @ save sp
+
+       ldr     r10, [r3, #240]         @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r10, lsl#7             @ 128 bytes per inner round key
+       add     r12, #96                        @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       mov     sp, r12                         @ sp is sp
+       bl      _bsaes_key_convert
+       vldmia  sp, {q6}
+       vstmia  r12,  {q15}             @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  sp, {q7}
+#else
+       ldr     r12, [r3, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [r3, #244]
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       add     r12, r3, #248                   @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, r3, #248
+       vldmia  r4, {q6}
+       vstmia  r12, {q15}                      @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  r4, {q7}
+
+.align 2
+0:
+#endif
+
+       vld1.8  {q15}, [r8]             @ load IV
+       b       .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+       subs    r2, r2, #0x8
+       bmi     .Lcbc_dec_loop_finish
+
+       vld1.8  {q0-q1}, [r0]!  @ load input
+       vld1.8  {q2-q3}, [r0]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, sp                  @ pass the key
+#else
+       add     r4, r3, #248
+#endif
+       vld1.8  {q4-q5}, [r0]!
+       mov     r5, r10
+       vld1.8  {q6-q7}, [r0]
+       sub     r0, r0, #0x60
+       vstmia  r9, {q15}                       @ put aside IV
+
+       bl      _bsaes_decrypt8
+
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q12-q13}, [r0]!
+       veor    q4, q4, q10
+       veor    q2, q2, q11
+       vld1.8  {q14-q15}, [r0]!
+       veor    q7, q7, q12
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       veor    q3, q3, q13
+       vst1.8  {q6}, [r1]!
+       veor    q5, q5, q14
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       vst1.8  {q7}, [r1]!
+       vst1.8  {q3}, [r1]!
+       vst1.8  {q5}, [r1]!
+
+       b       .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+       adds    r2, r2, #8
+       beq     .Lcbc_dec_done
+
+       vld1.8  {q0}, [r0]!             @ load input
+       cmp     r2, #2
+       blo     .Lcbc_dec_one
+       vld1.8  {q1}, [r0]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, sp                  @ pass the key
+#else
+       add     r4, r3, #248
+#endif
+       mov     r5, r10
+       vstmia  r9, {q15}                       @ put aside IV
+       beq     .Lcbc_dec_two
+       vld1.8  {q2}, [r0]!
+       cmp     r2, #4
+       blo     .Lcbc_dec_three
+       vld1.8  {q3}, [r0]!
+       beq     .Lcbc_dec_four
+       vld1.8  {q4}, [r0]!
+       cmp     r2, #6
+       blo     .Lcbc_dec_five
+       vld1.8  {q5}, [r0]!
+       beq     .Lcbc_dec_six
+       vld1.8  {q6}, [r0]!
+       sub     r0, r0, #0x70
+
+       bl      _bsaes_decrypt8
+
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q12-q13}, [r0]!
+       veor    q4, q4, q10
+       veor    q2, q2, q11
+       vld1.8  {q15}, [r0]!
+       veor    q7, q7, q12
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       veor    q3, q3, q13
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       vst1.8  {q7}, [r1]!
+       vst1.8  {q3}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+       sub     r0, r0, #0x60
+       bl      _bsaes_decrypt8
+       vldmia  r9,{q14}                        @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q12}, [r0]!
+       veor    q4, q4, q10
+       veor    q2, q2, q11
+       vld1.8  {q15}, [r0]!
+       veor    q7, q7, q12
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       vst1.8  {q7}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+       sub     r0, r0, #0x50
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10-q11}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q15}, [r0]!
+       veor    q4, q4, q10
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       veor    q2, q2, q11
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       vst1.8  {q2}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+       sub     r0, r0, #0x40
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q10}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vld1.8  {q15}, [r0]!
+       veor    q4, q4, q10
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       vst1.8  {q6}, [r1]!
+       vst1.8  {q4}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+       sub     r0, r0, #0x30
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8-q9}, [r0]!  @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q15}, [r0]!
+       veor    q1, q1, q8
+       veor    q6, q6, q9
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       vst1.8  {q6}, [r1]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+       sub     r0, r0, #0x20
+       bl      _bsaes_decrypt8
+       vldmia  r9, {q14}                       @ reload IV
+       vld1.8  {q8}, [r0]!             @ reload input
+       veor    q0, q0, q14     @ ^= IV
+       vld1.8  {q15}, [r0]!            @ reload input
+       veor    q1, q1, q8
+       vst1.8  {q0-q1}, [r1]!  @ write output
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+       sub     r0, r0, #0x10
+       mov     r10, r1                 @ save original out pointer
+       mov     r1, r9                  @ use the iv scratch space as out buffer
+       mov     r2, r3
+       vmov    q4,q15          @ just in case ensure that IV
+       vmov    q5,q0                   @ and input are preserved
+       bl      AES_decrypt
+       vld1.8  {q0}, [r9,:64]          @ load result
+       veor    q0, q0, q4      @ ^= IV
+       vmov    q15, q5         @ q5 holds input
+       vst1.8  {q0}, [r10]             @ write output
+
+.Lcbc_dec_done:
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+.Lcbc_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r9
+       bne             .Lcbc_dec_bzero
+#endif
+
+       mov     sp, r9
+       add     sp, #0x10                       @ add sp,r9,#0x10 is no good for thumb
+       vst1.8  {q15}, [r8]             @ return IV
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}
+.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
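
As the "enc == 0" note above indicates, this entry point is only used for decryption; CBC encryption is inherently serial and cannot be spread across eight blocks. What the routine computes is ordinary CBC decryption, P[i] = D(C[i]) ^ C[i-1], with the IV standing in for C[-1] and the final ciphertext block written back as the next IV. A hedged plain-C reference (the names and the function-pointer type are illustrative, not kernel interfaces):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*aes_decrypt_block_fn)(const uint8_t in[16], uint8_t out[16],
                                     const void *key);

/* Reference CBC decryption; works in place because C[i] is saved first. */
static void cbc_decrypt_ref(const uint8_t *in, uint8_t *out, size_t blocks,
                            const void *key, uint8_t iv[16],
                            aes_decrypt_block_fn decrypt)
{
        uint8_t prev[16], tmp[16];

        memcpy(prev, iv, 16);
        for (size_t i = 0; i < blocks; i++) {
                memcpy(tmp, in + 16 * i, 16);   /* keep C[i] for the next block */
                decrypt(tmp, out + 16 * i, key);
                for (int j = 0; j < 16; j++)
                        out[16 * i + j] ^= prev[j];
                memcpy(prev, tmp, 16);
        }
        memcpy(iv, prev, 16);                   /* chain the IV, as [r8] above */
}
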
+.extern        AES_encrypt
+.global        bsaes_ctr32_encrypt_blocks
+.type  bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+       cmp     r2, #8                  @ use plain AES for
+       blo     .Lctr_enc_short                 @ small sizes
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     r8, [ip]                        @ ctr is 1st arg on the stack
+       sub     sp, sp, #0x10                   @ scratch space to carry over the ctr
+       mov     r9, sp                          @ save sp
+
+       ldr     r10, [r3, #240]         @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r10, lsl#7             @ 128 bytes per inner round key
+       add     r12, #96                        @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       mov     sp, r12                         @ sp is sp
+       bl      _bsaes_key_convert
+       veor    q7,q7,q15       @ fix up last round key
+       vstmia  r12, {q7}                       @ save last round key
+
+       vld1.8  {q0}, [r8]              @ load counter
+       add     r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+       vldmia  sp, {q4}                @ load round0 key
+#else
+       ldr     r12, [r3, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [r3, #244]
+       mov     r4, r3                  @ pass key
+       mov     r5, r10                 @ pass # of rounds
+       add     r12, r3, #248                   @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    q7,q7,q15       @ fix up last round key
+       vstmia  r12, {q7}                       @ save last round key
+
+.align 2
+0:     add     r12, r3, #248
+       vld1.8  {q0}, [r8]              @ load counter
+       adrl    r8, .LREVM0SR                   @ borrow r8
+       vldmia  r12, {q4}                       @ load round0 key
+       sub     sp, #0x10                       @ place for adjusted round0 key
+#endif
+
+       vmov.i32        q8,#1           @ compose 1<<96
+       veor            q9,q9,q9
+       vrev32.8        q0,q0
+       vext.8          q8,q9,q8,#4
+       vrev32.8        q4,q4
+       vadd.u32        q9,q8,q8        @ compose 2<<96
+       vstmia  sp, {q4}                @ save adjusted round0 key
+       b       .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+       vadd.u32        q10, q8, q9     @ compose 3<<96
+       vadd.u32        q1, q0, q8      @ +1
+       vadd.u32        q2, q0, q9      @ +2
+       vadd.u32        q3, q0, q10     @ +3
+       vadd.u32        q4, q1, q10
+       vadd.u32        q5, q2, q10
+       vadd.u32        q6, q3, q10
+       vadd.u32        q7, q4, q10
+       vadd.u32        q10, q5, q10    @ next counter
+
+       @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+       @ to flip byte order in 32-bit counter
+
+       vldmia          sp, {q9}                @ load round0 key
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x10           @ pass next round key
+#else
+       add             r4, r3, #264
+#endif
+       vldmia          r8, {q8}                        @ .LREVM0SR
+       mov             r5, r10                 @ pass rounds
+       vstmia          r9, {q10}                       @ save next counter
+       sub             r6, r8, #.LREVM0SR-.LSR @ pass constants
+
+       bl              _bsaes_encrypt8_alt
+
+       subs            r2, r2, #8
+       blo             .Lctr_enc_loop_done
+
+       vld1.8          {q8-q9}, [r0]!  @ load input
+       vld1.8          {q10-q11}, [r0]!
+       veor            q0, q8
+       veor            q1, q9
+       vld1.8          {q12-q13}, [r0]!
+       veor            q4, q10
+       veor            q6, q11
+       vld1.8          {q14-q15}, [r0]!
+       veor            q3, q12
+       vst1.8          {q0-q1}, [r1]!  @ write output
+       veor            q7, q13
+       veor            q2, q14
+       vst1.8          {q4}, [r1]!
+       veor            q5, q15
+       vst1.8          {q6}, [r1]!
+       vmov.i32        q8, #1                  @ compose 1<<96
+       vst1.8          {q3}, [r1]!
+       veor            q9, q9, q9
+       vst1.8          {q7}, [r1]!
+       vext.8          q8, q9, q8, #4
+       vst1.8          {q2}, [r1]!
+       vadd.u32        q9,q8,q8                @ compose 2<<96
+       vst1.8          {q5}, [r1]!
+       vldmia          r9, {q0}                        @ load counter
+
+       bne             .Lctr_enc_loop
+       b               .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+       add             r2, r2, #8
+       vld1.8          {q8}, [r0]!     @ load input
+       veor            q0, q8
+       vst1.8          {q0}, [r1]!     @ write output
+       cmp             r2, #2
+       blo             .Lctr_enc_done
+       vld1.8          {q9}, [r0]!
+       veor            q1, q9
+       vst1.8          {q1}, [r1]!
+       beq             .Lctr_enc_done
+       vld1.8          {q10}, [r0]!
+       veor            q4, q10
+       vst1.8          {q4}, [r1]!
+       cmp             r2, #4
+       blo             .Lctr_enc_done
+       vld1.8          {q11}, [r0]!
+       veor            q6, q11
+       vst1.8          {q6}, [r1]!
+       beq             .Lctr_enc_done
+       vld1.8          {q12}, [r0]!
+       veor            q3, q12
+       vst1.8          {q3}, [r1]!
+       cmp             r2, #6
+       blo             .Lctr_enc_done
+       vld1.8          {q13}, [r0]!
+       veor            q7, q13
+       vst1.8          {q7}, [r1]!
+       beq             .Lctr_enc_done
+       vld1.8          {q14}, [r0]
+       veor            q2, q14
+       vst1.8          {q2}, [r1]!
+
+.Lctr_enc_done:
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifndef        BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:                       @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r9
+       bne             .Lctr_enc_bzero
+#else
+       vstmia          sp, {q0-q1}
+#endif
+
+       mov     sp, r9
+       add     sp, #0x10               @ add sp,r9,#0x10 is no good for thumb
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}       @ return
+
+.align 4
+.Lctr_enc_short:
+       ldr     ip, [sp]                @ ctr pointer is passed on stack
+       stmdb   sp!, {r4-r8, lr}
+
+       mov     r4, r0          @ copy arguments
+       mov     r5, r1
+       mov     r6, r2
+       mov     r7, r3
+       ldr     r8, [ip, #12]           @ load counter LSW
+       vld1.8  {q1}, [ip]              @ load whole counter value
+#ifdef __ARMEL__
+       rev     r8, r8
+#endif
+       sub     sp, sp, #0x10
+       vst1.8  {q1}, [sp,:64]  @ copy counter value
+       sub     sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+       add     r0, sp, #0x10           @ input counter value
+       mov     r1, sp                  @ output on the stack
+       mov     r2, r7                  @ key
+
+       bl      AES_encrypt
+
+       vld1.8  {q0}, [r4]!     @ load input
+       vld1.8  {q1}, [sp,:64]  @ load encrypted counter
+       add     r8, r8, #1
+#ifdef __ARMEL__
+       rev     r0, r8
+       str     r0, [sp, #0x1c]         @ next counter value
+#else
+       str     r8, [sp, #0x1c]         @ next counter value
+#endif
+       veor    q0,q0,q1
+       vst1.8  {q0}, [r5]!     @ store output
+       subs    r6, r6, #1
+       bne     .Lctr_enc_short_loop
+
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+       vstmia          sp!, {q0-q1}
+
+       ldmia   sp!, {r4-r8, pc}
+.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
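
The "32" in the name is the important detail: as in the 1<<96 counter composition above, only the low 32-bit word of the 16-byte counter block is incremented, as a big-endian value that wraps at 2^32, and each keystream block E(counter) is XORed into the data. A hedged plain-C reference of the same mode (names and the function-pointer type are illustrative):

#include <stddef.h>
#include <stdint.h>

typedef void (*aes_encrypt_block_fn)(const uint8_t in[16], uint8_t out[16],
                                     const void *key);

/* Reference ctr32: big-endian increment of ctr[12..15] only, wrapping at 2^32. */
static void ctr32_encrypt_ref(const uint8_t *in, uint8_t *out, size_t blocks,
                              const void *key, uint8_t ctr[16],
                              aes_encrypt_block_fn encrypt)
{
        uint8_t ks[16];

        for (size_t i = 0; i < blocks; i++) {
                encrypt(ctr, ks, key);
                for (int j = 0; j < 16; j++)
                        out[16 * i + j] = in[16 * i + j] ^ ks[j];

                for (int j = 15; j >= 12; j--)  /* ripple the carry upwards */
                        if (++ctr[j] != 0)
                                break;
        }
}
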
+.globl bsaes_xts_encrypt
+.type  bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future r3
+
+       mov     r7, r0
+       mov     r8, r1
+       mov     r9, r2
+       mov     r10, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0,sp                           @ pointer to initial tweak
+#endif
+
+       ldr     r1, [r10, #240]         @ get # of rounds
+       mov     r3, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r1, lsl#7              @ 128 bytes per inner round key
+       @ add   r12, #96                        @ size of bit-sliced key schedule
+       sub     r12, #48                        @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    q7, q7, q15     @ fix up last round key
+       vstmia  r12, {q7}                       @ save last round key
+#else
+       ldr     r12, [r10, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [r10, #244]
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       add     r12, r10, #248                  @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    q7, q7, q15     @ fix up last round key
+       vstmia  r12, {q7}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+
+       vld1.8  {q8}, [r0]                      @ initial tweak
+       adr     r2, .Lxts_magic
+
+       subs    r9, #0x80
+       blo     .Lxts_enc_short
+       b       .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q6, q8, #63
+       mov             r0, sp
+       vand            q6, q6, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q9, #63
+       veor            q9, q9, q6
+       vand            q7, q7, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q10, #63
+       veor            q10, q10, q7
+       vand            q6, q6, q5
+       vld1.8          {q0}, [r7]!
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q11, #63
+       veor            q11, q11, q6
+       vand            q7, q7, q5
+       vld1.8          {q1}, [r7]!
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q12, #63
+       veor            q12, q12, q7
+       vand            q6, q6, q5
+       vld1.8          {q2}, [r7]!
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q13, #63
+       veor            q13, q13, q6
+       vand            q7, q7, q5
+       vld1.8          {q3}, [r7]!
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q14, #63
+       veor            q14, q14, q7
+       vand            q6, q6, q5
+       vld1.8          {q4}, [r7]!
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q15, #63
+       veor            q15, q15, q6
+       vand            q7, q7, q5
+       vld1.8          {q5}, [r7]!
+       veor            q4, q4, q12
+       vadd.u64        q8, q15, q15
+       vst1.64         {q15}, [r0,:128]!
+       vswp            d15,d14
+       veor            q8, q8, q7
+       vst1.64         {q8}, [r0,:128]         @ next round tweak
+
+       vld1.8          {q6-q7}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       veor            q7, q7, q15
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       vld1.64         {q14-q15}, [r0,:128]!
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q2, q14
+       vst1.8          {q10-q11}, [r8]!
+       veor            q13, q5, q15
+       vst1.8          {q12-q13}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+
+       subs            r9, #0x80
+       bpl             .Lxts_enc_loop
+
+.Lxts_enc_short:
+       adds            r9, #0x70
+       bmi             .Lxts_enc_done
+
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q7, q8, #63
+       mov             r0, sp
+       vand            q7, q7, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q9, #63
+       veor            q9, q9, q7
+       vand            q6, q6, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q10, #63
+       veor            q10, q10, q6
+       vand            q7, q7, q5
+       vld1.8          {q0}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_1
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q11, #63
+       veor            q11, q11, q7
+       vand            q6, q6, q5
+       vld1.8          {q1}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_2
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q12, #63
+       veor            q12, q12, q6
+       vand            q7, q7, q5
+       vld1.8          {q2}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_3
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q13, #63
+       veor            q13, q13, q7
+       vand            q6, q6, q5
+       vld1.8          {q3}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_4
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q14, #63
+       veor            q14, q14, q6
+       vand            q7, q7, q5
+       vld1.8          {q4}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_5
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q15, #63
+       veor            q15, q15, q7
+       vand            q6, q6, q5
+       vld1.8          {q5}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_enc_6
+       veor            q4, q4, q12
+       sub             r9, #0x10
+       vst1.64         {q15}, [r0,:128]                @ next round tweak
+
+       vld1.8          {q6}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       vld1.64         {q14}, [r0,:128]!
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q2, q14
+       vst1.8          {q10-q11}, [r8]!
+       vst1.8          {q12}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+       vst1.64         {q14}, [r0,:128]                @ next round tweak
+
+       veor            q4, q4, q12
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q5, q5, q13
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       vst1.8          {q10-q11}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+       .quad   1, 0x87
+
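For reference, the .Lxts_magic constant {1, 0x87} drives the tweak update used throughout the loops above: each 16-byte tweak is multiplied by x in GF(2^128), with 0x87 folded into the low byte when bit 127 carries out, and the 1 supplying the carry from the low 64-bit lane into the high one. A minimal C sketch of one doubling step, assuming the tweak is held as two little-endian 64-bit halves as the NEON code does (illustrative only, not part of the patch; the function name is mine):

    #include <stdint.h>

    /* multiply the 128-bit XTS tweak by x modulo x^128 + x^7 + x^2 + x + 1 */
    static void xts_double_tweak(uint64_t t[2])     /* t[0] = low half */
    {
            /* arithmetic shift replicates bit 63, like vshr.s64 #63 */
            uint64_t fb = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

            t[1] = (t[1] << 1) | (t[0] >> 63);      /* carry low -> high */
            t[0] = (t[0] << 1) ^ fb;                /* reduce if bit 127 was set */
    }

The repeated vshr.s64/vand/vswp/veor groups in .Lxts_enc_loop and .Lxts_enc_short are the vectorised form of this step, computing the next tweak while the current one is stored away for the round.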
+.align 5
+.Lxts_enc_5:
+       vst1.64         {q13}, [r0,:128]                @ next round tweak
+
+       veor            q3, q3, q11
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q4, q4, q12
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       veor            q10, q3, q12
+       vst1.8          {q8-q9}, [r8]!
+       vst1.8          {q10}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+       vst1.64         {q12}, [r0,:128]                @ next round tweak
+
+       veor            q2, q2, q10
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q3, q3, q11
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q6, q11
+       vst1.8          {q8-q9}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+       vst1.64         {q11}, [r0,:128]                @ next round tweak
+
+       veor            q1, q1, q9
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q2, q2, q10
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q4, q10
+       vst1.8          {q0-q1}, [r8]!
+       vst1.8          {q8}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+       vst1.64         {q10}, [r0,:128]                @ next round tweak
+
+       veor            q0, q0, q8
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q1, q1, q9
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       vst1.8          {q0-q1}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+       mov             r0, sp
+       veor            q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                          @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r8]!
+       mov             r3, r4
+
+       vmov            q8, q9          @ next round tweak
+
+.Lxts_enc_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            r9, #0x10
+       beq             .Lxts_enc_ret
+       sub             r6, r8, #0x10
+
+.Lxts_enc_steal:
+       ldrb            r0, [r7], #1
+       ldrb            r1, [r8, #-0x10]
+       strb            r0, [r8, #-0x10]
+       strb            r1, [r8], #1
+
+       subs            r9, #1
+       bhi             .Lxts_enc_steal
+
+       vld1.8          {q0}, [r6]
+       mov             r0, sp
+       veor            q0, q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                  @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r6]
+       mov             r3, r4
+#endif
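The .Lxts_enc_steal loop above handles XTS ciphertext stealing for a trailing partial block: each plaintext tail byte is swapped into the last full ciphertext block already written, the displaced ciphertext byte becomes the output tail, and the modified block is then encrypted once more with the next tweak. An illustrative C sketch of the byte-swapping part (names are mine, not part of the patch):

    #include <stdint.h>

    /* last_ct: last full ciphertext block already in the output buffer
     * tail_in/tail_out: the partial-block input and output, tail = 1..15 */
    static void xts_steal_swap(uint8_t *last_ct, uint8_t *tail_out,
                               const uint8_t *tail_in, unsigned int tail)
    {
            unsigned int i;

            for (i = 0; i < tail; i++) {
                    uint8_t stolen = last_ct[i];    /* ldrb r1, [r8, #-0x10] */

                    last_ct[i] = tail_in[i];        /* strb r0, [r8, #-0x10] */
                    tail_out[i] = stolen;           /* strb r1, [r8], #1     */
            }
            /* the caller then XORs last_ct with the next tweak, AES-encrypts
             * it and XORs again, as the code above does via r6 */
    }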
+
+.Lxts_enc_ret:
+       bic             r0, r3, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [r3, #0x20+VFP_ABI_FRAME]   @ chain tweak
+#endif
+.Lxts_enc_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_enc_bzero
+
+       mov             sp, r3
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {q8}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type  bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future r3
+
+       mov     r7, r0
+       mov     r8, r1
+       mov     r9, r2
+       mov     r10, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0, sp                          @ pointer to initial tweak
+#endif
+
+       ldr     r1, [r10, #240]         @ get # of rounds
+       mov     r3, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, r1, lsl#7              @ 128 bytes per inner round key
+       @ add   r12, #96                        @ size of bit-sliced key schedule
+       sub     r12, #48                        @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, sp, #0x90
+       vldmia  r4, {q6}
+       vstmia  r12,  {q15}             @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  r4, {q7}
+#else
+       ldr     r12, [r10, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [r10, #244]
+       mov     r4, r10                 @ pass key
+       mov     r5, r1                  @ pass # of rounds
+       add     r12, r10, #248                  @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, r10, #248
+       vldmia  r4, {q6}
+       vstmia  r12,  {q15}             @ save last round key
+       veor    q7, q7, q6      @ fix up round 0 key
+       vstmia  r4, {q7}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+       vld1.8  {q8}, [r0]                      @ initial tweak
+       adr     r2, .Lxts_magic
+
+       tst     r9, #0xf                        @ if not multiple of 16
+       it      ne                              @ Thumb2 thing, sanity check in ARM
+       subne   r9, #0x10                       @ subtract another 16 bytes
+       subs    r9, #0x80
+
+       blo     .Lxts_dec_short
+       b       .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q6, q8, #63
+       mov             r0, sp
+       vand            q6, q6, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q9, #63
+       veor            q9, q9, q6
+       vand            q7, q7, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q10, #63
+       veor            q10, q10, q7
+       vand            q6, q6, q5
+       vld1.8          {q0}, [r7]!
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q11, #63
+       veor            q11, q11, q6
+       vand            q7, q7, q5
+       vld1.8          {q1}, [r7]!
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q12, #63
+       veor            q12, q12, q7
+       vand            q6, q6, q5
+       vld1.8          {q2}, [r7]!
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q13, #63
+       veor            q13, q13, q6
+       vand            q7, q7, q5
+       vld1.8          {q3}, [r7]!
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q14, #63
+       veor            q14, q14, q7
+       vand            q6, q6, q5
+       vld1.8          {q4}, [r7]!
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q15, #63
+       veor            q15, q15, q6
+       vand            q7, q7, q5
+       vld1.8          {q5}, [r7]!
+       veor            q4, q4, q12
+       vadd.u64        q8, q15, q15
+       vst1.64         {q15}, [r0,:128]!
+       vswp            d15,d14
+       veor            q8, q8, q7
+       vst1.64         {q8}, [r0,:128]         @ next round tweak
+
+       vld1.8          {q6-q7}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       veor            q7, q7, q15
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       vld1.64         {q14-q15}, [r0,:128]!
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q3, q14
+       vst1.8          {q10-q11}, [r8]!
+       veor            q13, q5, q15
+       vst1.8          {q12-q13}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+
+       subs            r9, #0x80
+       bpl             .Lxts_dec_loop
+
+.Lxts_dec_short:
+       adds            r9, #0x70
+       bmi             .Lxts_dec_done
+
+       vldmia          r2, {q5}        @ load XTS magic
+       vshr.s64        q7, q8, #63
+       mov             r0, sp
+       vand            q7, q7, q5
+       vadd.u64        q9, q8, q8
+       vst1.64         {q8}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q9, #63
+       veor            q9, q9, q7
+       vand            q6, q6, q5
+       vadd.u64        q10, q9, q9
+       vst1.64         {q9}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q10, #63
+       veor            q10, q10, q6
+       vand            q7, q7, q5
+       vld1.8          {q0}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_1
+       vadd.u64        q11, q10, q10
+       vst1.64         {q10}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q11, #63
+       veor            q11, q11, q7
+       vand            q6, q6, q5
+       vld1.8          {q1}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_2
+       veor            q0, q0, q8
+       vadd.u64        q12, q11, q11
+       vst1.64         {q11}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q12, #63
+       veor            q12, q12, q6
+       vand            q7, q7, q5
+       vld1.8          {q2}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_3
+       veor            q1, q1, q9
+       vadd.u64        q13, q12, q12
+       vst1.64         {q12}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q13, #63
+       veor            q13, q13, q7
+       vand            q6, q6, q5
+       vld1.8          {q3}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_4
+       veor            q2, q2, q10
+       vadd.u64        q14, q13, q13
+       vst1.64         {q13}, [r0,:128]!
+       vswp            d13,d12
+       vshr.s64        q7, q14, #63
+       veor            q14, q14, q6
+       vand            q7, q7, q5
+       vld1.8          {q4}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_5
+       veor            q3, q3, q11
+       vadd.u64        q15, q14, q14
+       vst1.64         {q14}, [r0,:128]!
+       vswp            d15,d14
+       vshr.s64        q6, q15, #63
+       veor            q15, q15, q7
+       vand            q6, q6, q5
+       vld1.8          {q5}, [r7]!
+       subs            r9, #0x10
+       bmi             .Lxts_dec_6
+       veor            q4, q4, q12
+       sub             r9, #0x10
+       vst1.64         {q15}, [r0,:128]                @ next round tweak
+
+       vld1.8          {q6}, [r7]!
+       veor            q5, q5, q13
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q6, q6, q14
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       vld1.64         {q14}, [r0,:128]!
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       veor            q12, q3, q14
+       vst1.8          {q10-q11}, [r8]!
+       vst1.8          {q12}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+       vst1.64         {q14}, [r0,:128]                @ next round tweak
+
+       veor            q4, q4, q12
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q5, q5, q13
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12-q13}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       veor            q11, q7, q13
+       vst1.8          {q10-q11}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+       vst1.64         {q13}, [r0,:128]                @ next round tweak
+
+       veor            q3, q3, q11
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q4, q4, q12
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       vld1.64         {q12}, [r0,:128]!
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       veor            q10, q2, q12
+       vst1.8          {q8-q9}, [r8]!
+       vst1.8          {q10}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+       vst1.64         {q12}, [r0,:128]                @ next round tweak
+
+       veor            q2, q2, q10
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q3, q3, q11
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10-q11}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       veor            q9, q4, q11
+       vst1.8          {q8-q9}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+       vst1.64         {q11}, [r0,:128]                @ next round tweak
+
+       veor            q1, q1, q9
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q2, q2, q10
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       vld1.64         {q10}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       veor            q8, q6, q10
+       vst1.8          {q0-q1}, [r8]!
+       vst1.8          {q8}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+       vst1.64         {q10}, [r0,:128]                @ next round tweak
+
+       veor            q0, q0, q8
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, r10, #248                   @ pass key schedule
+#endif
+       veor            q1, q1, q9
+       mov             r5, r1                  @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {q8-q9}, [r0,:128]!
+       veor            q0, q0, q8
+       veor            q1, q1, q9
+       vst1.8          {q0-q1}, [r8]!
+
+       vld1.64         {q8}, [r0,:128]         @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+       mov             r0, sp
+       veor            q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                          @ preserve fp
+       mov             r5, r2                  @ preserve magic
+
+       bl              AES_decrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r8]!
+       mov             r3, r4
+       mov             r2, r5
+
+       vmov            q8, q9          @ next round tweak
+
+.Lxts_dec_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            r9, #0x10
+       beq             .Lxts_dec_ret
+
+       @ calculate one round of extra tweak for the stolen ciphertext
+       vldmia          r2, {q5}
+       vshr.s64        q6, q8, #63
+       vand            q6, q6, q5
+       vadd.u64        q9, q8, q8
+       vswp            d13,d12
+       veor            q9, q9, q6
+
+       @ perform the final decryption with the last tweak value
+       vld1.8          {q0}, [r7]!
+       mov             r0, sp
+       veor            q0, q0, q9
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+       mov             r4, r3                  @ preserve fp
+
+       bl              AES_decrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q9
+       vst1.8          {q0}, [r8]
+
+       mov             r6, r8
+.Lxts_dec_steal:
+       ldrb            r1, [r8]
+       ldrb            r0, [r7], #1
+       strb            r1, [r8, #0x10]
+       strb            r0, [r8], #1
+
+       subs            r9, #1
+       bhi             .Lxts_dec_steal
+
+       vld1.8          {q0}, [r6]
+       mov             r0, sp
+       veor            q0, q8
+       mov             r1, sp
+       vst1.8          {q0}, [sp,:128]
+       mov             r2, r10
+
+       bl              AES_decrypt
+
+       vld1.8          {q0}, [sp,:128]
+       veor            q0, q0, q8
+       vst1.8          {q0}, [r6]
+       mov             r3, r4
+#endif
+
+.Lxts_dec_ret:
+       bic             r0, r3, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [r3, #0x20+VFP_ABI_FRAME]   @ chain tweak
+#endif
+.Lxts_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_dec_bzero
+
+       mov             sp, r3
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {q8}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
new file mode 100644 (file)
index 0000000..4522366
--- /dev/null
@@ -0,0 +1,434 @@
+/*
+ * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+
+#include "aes_glue.h"
+
+#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
+
+struct BS_KEY {
+       struct AES_KEY  rk;
+       int             converted;
+       u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
+} __aligned(8);
+
+asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
+asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
+
+asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
+                                 struct BS_KEY *key, u8 iv[]);
+
+asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
+                                          struct BS_KEY *key, u8 const iv[]);
+
+asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
+                                 struct BS_KEY *key, u8 tweak[]);
+
+asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
+                                 struct BS_KEY *key, u8 tweak[]);
+
+struct aesbs_cbc_ctx {
+       struct AES_KEY  enc;
+       struct BS_KEY   dec;
+};
+
+struct aesbs_ctr_ctx {
+       struct BS_KEY   enc;
+};
+
+struct aesbs_xts_ctx {
+       struct BS_KEY   enc;
+       struct BS_KEY   dec;
+       struct AES_KEY  twkey;
+};
+
+static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                            unsigned int key_len)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+       int bits = key_len * 8;
+
+       if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       ctx->dec.rk = ctx->enc;
+       private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+       ctx->dec.converted = 0;
+       return 0;
+}
+
+static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                            unsigned int key_len)
+{
+       struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+       int bits = key_len * 8;
+
+       if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       ctx->enc.converted = 0;
+       return 0;
+}
+
+static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+                            unsigned int key_len)
+{
+       struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+       int bits = key_len * 4;
+
+       if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+               return -EINVAL;
+       }
+       ctx->dec.rk = ctx->enc.rk;
+       private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+       private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
+       ctx->enc.converted = ctx->dec.converted = 0;
+       return 0;
+}
+
+static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt(desc, &walk);
+
+       while (walk.nbytes) {
+               u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+               u8 *src = walk.src.virt.addr;
+
+               if (walk.dst.virt.addr == walk.src.virt.addr) {
+                       u8 *iv = walk.iv;
+
+                       do {
+                               crypto_xor(src, iv, AES_BLOCK_SIZE);
+                               AES_encrypt(src, src, &ctx->enc);
+                               iv = src;
+                               src += AES_BLOCK_SIZE;
+                       } while (--blocks);
+                       memcpy(walk.iv, iv, AES_BLOCK_SIZE);
+               } else {
+                       u8 *dst = walk.dst.virt.addr;
+
+                       do {
+                               crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
+                               AES_encrypt(walk.iv, dst, &ctx->enc);
+                               memcpy(walk.iv, dst, AES_BLOCK_SIZE);
+                               src += AES_BLOCK_SIZE;
+                               dst += AES_BLOCK_SIZE;
+                       } while (--blocks);
+               }
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
+               kernel_neon_begin();
+               bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+                                 walk.nbytes, &ctx->dec, walk.iv);
+               kernel_neon_end();
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       while (walk.nbytes) {
+               u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+               u8 *dst = walk.dst.virt.addr;
+               u8 *src = walk.src.virt.addr;
+               u8 bk[2][AES_BLOCK_SIZE];
+               u8 *iv = walk.iv;
+
+               do {
+                       if (walk.dst.virt.addr == walk.src.virt.addr)
+                               memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
+
+                       AES_decrypt(src, dst, &ctx->dec.rk);
+                       crypto_xor(dst, iv, AES_BLOCK_SIZE);
+
+                       if (walk.dst.virt.addr == walk.src.virt.addr)
+                               iv = bk[blocks & 1];
+                       else
+                               iv = src;
+
+                       dst += AES_BLOCK_SIZE;
+                       src += AES_BLOCK_SIZE;
+               } while (--blocks);
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static void inc_be128_ctr(__be32 ctr[], u32 addend)
+{
+       int i;
+
+       for (i = 3; i >= 0; i--, addend = 1) {
+               u32 n = be32_to_cpu(ctr[i]) + addend;
+
+               ctr[i] = cpu_to_be32(n);
+               if (n >= addend)
+                       break;
+       }
+}
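inc_be128_ctr() adds 'addend' to a 128-bit big-endian counter with carry propagation; the "n >= addend" test detects that the current 32-bit word did not wrap, at which point propagation stops. A quick illustration of the wrap case (hypothetical demo code that would sit in this same file, not part of the patch):

    /* exercise the carry path of inc_be128_ctr() */
    static void inc_be128_ctr_demo(void)
    {
            __be32 ctr[4] = { 0, 0, 0, cpu_to_be32(0xffffffffU) };

            /* the low word wraps to 1; since 1 < 2 the carry continues,
             * and ctr[2] is bumped to 1 with addend reset to 1 */
            inc_be128_ctr(ctr, 2);
    }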
+
+static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst, struct scatterlist *src,
+                            unsigned int nbytes)
+{
+       struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       u32 blocks;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
+               u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+               __be32 *ctr = (__be32 *)walk.iv;
+               u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
+
+		/* avoid 32-bit counter overflow in the NEON code */
+               if (unlikely(headroom < blocks)) {
+                       blocks = headroom + 1;
+                       tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+               }
+               kernel_neon_begin();
+               bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
+                                          walk.dst.virt.addr, blocks,
+                                          &ctx->enc, walk.iv);
+               kernel_neon_end();
+               inc_be128_ctr(ctr, blocks);
+
+               nbytes -= blocks * AES_BLOCK_SIZE;
+               if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
+                       break;
+
+               err = blkcipher_walk_done(desc, &walk, tail);
+       }
+       if (walk.nbytes) {
+               u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+               u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+               u8 ks[AES_BLOCK_SIZE];
+
+               AES_encrypt(walk.iv, ks, &ctx->enc.rk);
+               if (tdst != tsrc)
+                       memcpy(tdst, tsrc, nbytes);
+               crypto_xor(tdst, ks, nbytes);
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
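The headroom cap above exists because bsaes_ctr32_encrypt_blocks() only increments the low 32 bits of the counter inside the NEON code, so each call is limited to what fits before that word wraps; inc_be128_ctr() then carries into the higher words. A small stand-alone illustration of the cap (my own sketch, not kernel code):

    #include <stdint.h>

    /* e.g. ctr_lo == 0xfffffffe gives headroom 1, so at most 2 blocks are
     * handed to the NEON routine before the walk resumes with the carry
     * applied to the full 128-bit counter */
    static uint32_t ctr_blocks_this_call(uint32_t ctr_lo, uint32_t blocks)
    {
            uint32_t headroom = UINT32_MAX - ctr_lo;

            return (headroom < blocks) ? headroom + 1 : blocks;
    }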
+
+static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       /* generate the initial tweak */
+       AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+       while (walk.nbytes) {
+               kernel_neon_begin();
+               bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+                                 walk.nbytes, &ctx->enc, walk.iv);
+               kernel_neon_end();
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
+                            struct scatterlist *dst,
+                            struct scatterlist *src, unsigned int nbytes)
+{
+       struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct blkcipher_walk walk;
+       int err;
+
+       blkcipher_walk_init(&walk, dst, src, nbytes);
+       err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+       /* generate the initial tweak */
+       AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+       while (walk.nbytes) {
+               kernel_neon_begin();
+               bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
+                                 walk.nbytes, &ctx->dec, walk.iv);
+               kernel_neon_end();
+               err = blkcipher_walk_done(desc, &walk, 0);
+       }
+       return err;
+}
+
+static struct crypto_alg aesbs_algs[] = { {
+       .cra_name               = "__cbc-aes-neonbs",
+       .cra_driver_name        = "__driver-cbc-aes-neonbs",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct aesbs_cbc_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_blkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = aesbs_cbc_set_key,
+               .encrypt        = aesbs_cbc_encrypt,
+               .decrypt        = aesbs_cbc_decrypt,
+       },
+}, {
+       .cra_name               = "__ctr-aes-neonbs",
+       .cra_driver_name        = "__driver-ctr-aes-neonbs",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          = 1,
+       .cra_ctxsize            = sizeof(struct aesbs_ctr_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_blkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = aesbs_ctr_set_key,
+               .encrypt        = aesbs_ctr_encrypt,
+               .decrypt        = aesbs_ctr_encrypt,
+       },
+}, {
+       .cra_name               = "__xts-aes-neonbs",
+       .cra_driver_name        = "__driver-xts-aes-neonbs",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct aesbs_xts_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_blkcipher = {
+               .min_keysize    = 2 * AES_MIN_KEY_SIZE,
+               .max_keysize    = 2 * AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = aesbs_xts_set_key,
+               .encrypt        = aesbs_xts_encrypt,
+               .decrypt        = aesbs_xts_decrypt,
+       },
+}, {
+       .cra_name               = "cbc(aes)",
+       .cra_driver_name        = "cbc-aes-neonbs",
+       .cra_priority           = 300,
+       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct async_helper_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_ablkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_init               = ablk_init,
+       .cra_exit               = ablk_exit,
+       .cra_ablkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = ablk_set_key,
+               .encrypt        = __ablk_encrypt,
+               .decrypt        = ablk_decrypt,
+       }
+}, {
+       .cra_name               = "ctr(aes)",
+       .cra_driver_name        = "ctr-aes-neonbs",
+       .cra_priority           = 300,
+       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+       .cra_blocksize          = 1,
+       .cra_ctxsize            = sizeof(struct async_helper_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_ablkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_init               = ablk_init,
+       .cra_exit               = ablk_exit,
+       .cra_ablkcipher = {
+               .min_keysize    = AES_MIN_KEY_SIZE,
+               .max_keysize    = AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = ablk_set_key,
+               .encrypt        = ablk_encrypt,
+               .decrypt        = ablk_decrypt,
+       }
+}, {
+       .cra_name               = "xts(aes)",
+       .cra_driver_name        = "xts-aes-neonbs",
+       .cra_priority           = 300,
+       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+       .cra_blocksize          = AES_BLOCK_SIZE,
+       .cra_ctxsize            = sizeof(struct async_helper_ctx),
+       .cra_alignmask          = 7,
+       .cra_type               = &crypto_ablkcipher_type,
+       .cra_module             = THIS_MODULE,
+       .cra_init               = ablk_init,
+       .cra_exit               = ablk_exit,
+       .cra_ablkcipher = {
+               .min_keysize    = 2 * AES_MIN_KEY_SIZE,
+               .max_keysize    = 2 * AES_MAX_KEY_SIZE,
+               .ivsize         = AES_BLOCK_SIZE,
+               .setkey         = ablk_set_key,
+               .encrypt        = ablk_encrypt,
+               .decrypt        = ablk_decrypt,
+       }
+} };
+
+static int __init aesbs_mod_init(void)
+{
+       if (!cpu_has_neon())
+               return -ENODEV;
+
+       return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+static void __exit aesbs_mod_exit(void)
+{
+       crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+module_init(aesbs_mod_init);
+module_exit(aesbs_mod_exit);
+
+MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
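The registration table pairs each "__"-prefixed synchronous NEON-only algorithm with an asynchronous cbc(aes)/ctr(aes)/xts(aes) wrapper built on the ablk helper, so the NEON path can be reached from any context. As a rough usage sketch for kernels of this era (hypothetical caller, not part of the patch), a user simply requests the template by name and priority selection does the rest:

    #include <linux/crypto.h>

    static struct crypto_ablkcipher *get_neon_xts_aes(void)
    {
            /* "xts(aes)" resolves to the highest-priority implementation;
             * with this module loaded and NEON present, cra_priority 300
             * selects "xts-aes-neonbs" over the generic fallback */
            return crypto_alloc_ablkcipher("xts(aes)", 0, 0);
    }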
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644 (file)
index 0000000..f3d96d9
--- /dev/null
@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is a direct adaptation of the bsaes-x86_64 module
+# for ARM NEON, except that this module is endian-neutral [in the sense
+# that it can be compiled for either endianness] courtesy of vld1.8's
+# neutrality. The initial version doesn't implement an interface to
+# OpenSSL, only low-level primitives and unsupported entry points, just
+# enough to collect performance results, which for the Cortex-A8 core are:
+#
+# encrypt      19.5 cycles per byte processed with 128-bit key
+# decrypt      22.1 cycles per byte processed with 128-bit key
+# key conv.    440  cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results, keep in mind that the NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results, keep in mind the key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+#                                              <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+#                                      <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+       &InBasisChange  (@b);
+       &Inv_GF256      (@b[6,5,0,3,7,1,4,2],@t,@s);
+       &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+       veor    @b[2], @b[2], @b[1]
+       veor    @b[5], @b[5], @b[6]
+       veor    @b[3], @b[3], @b[0]
+       veor    @b[6], @b[6], @b[2]
+       veor    @b[5], @b[5], @b[0]
+
+       veor    @b[6], @b[6], @b[3]
+       veor    @b[3], @b[3], @b[7]
+       veor    @b[7], @b[7], @b[5]
+       veor    @b[3], @b[3], @b[4]
+       veor    @b[4], @b[4], @b[5]
+
+       veor    @b[2], @b[2], @b[7]
+       veor    @b[3], @b[3], @b[1]
+       veor    @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+       veor    @b[0], @b[0], @b[6]
+       veor    @b[1], @b[1], @b[4]
+       veor    @b[4], @b[4], @b[6]
+       veor    @b[2], @b[2], @b[0]
+       veor    @b[6], @b[6], @b[1]
+
+       veor    @b[1], @b[1], @b[5]
+       veor    @b[5], @b[5], @b[3]
+       veor    @b[3], @b[3], @b[7]
+       veor    @b[7], @b[7], @b[5]
+       veor    @b[2], @b[2], @b[5]
+
+       veor    @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb         > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb        > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+       &InvInBasisChange       (@b);
+       &Inv_GF256              (@b[5,1,2,6,3,7,0,4],@t,@s);
+       &InvOutBasisChange      (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {         # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+        veor   @b[1], @b[1], @b[7]
+       veor    @b[4], @b[4], @b[7]
+
+       veor    @b[7], @b[7], @b[5]
+        veor   @b[1], @b[1], @b[3]
+       veor    @b[2], @b[2], @b[5]
+       veor    @b[3], @b[3], @b[7]
+
+       veor    @b[6], @b[6], @b[1]
+       veor    @b[2], @b[2], @b[0]
+        veor   @b[5], @b[5], @b[3]
+       veor    @b[4], @b[4], @b[6]
+       veor    @b[0], @b[0], @b[6]
+       veor    @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange {                # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+       veor    @b[1], @b[1], @b[5]
+       veor    @b[2], @b[2], @b[7]
+
+       veor    @b[3], @b[3], @b[1]
+       veor    @b[4], @b[4], @b[5]
+       veor    @b[7], @b[7], @b[5]
+       veor    @b[3], @b[3], @b[4]
+        veor   @b[5], @b[5], @b[0]
+       veor    @b[3], @b[3], @b[7]
+        veor   @b[6], @b[6], @b[2]
+        veor   @b[2], @b[2], @b[1]
+       veor    @b[6], @b[6], @b[3]
+
+       veor    @b[3], @b[3], @b[0]
+       veor    @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+       veor    $t0, $y0, $y1
+       vand    $t0, $t0, $x0
+       veor    $x0, $x0, $x1
+       vand    $t1, $x1, $y0
+       vand    $x0, $x0, $y1
+       veor    $x1, $t1, $t0
+       veor    $x0, $x0, $t1
+___
+}
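Mul_GF4 multiplies two bit-sliced GF(2^2) elements, each held one bit per register, using three ANDs and four XORs. A scalar C transliteration of the same data flow, with each uint64_t standing in for one NEON register of bit-sliced data (names are mine, illustrative only):

    #include <stdint.h>

    /* (x1:x0) *= (y1:y0) in bit-sliced form */
    static inline void mul_gf4(uint64_t *x0, uint64_t *x1,
                               uint64_t y0, uint64_t y1)
    {
            uint64_t t0 = (y0 ^ y1) & *x0;  /* veor t0,y0,y1; vand t0,t0,x0 */
            uint64_t t1 = *x1 & y0;         /* vand t1,x1,y0                */
            uint64_t a  = (*x0 ^ *x1) & y1; /* veor x0,x0,x1; vand x0,x0,y1 */

            *x1 = t1 ^ t0;                  /* veor x1,t1,t0 */
            *x0 = a  ^ t1;                  /* veor x0,x0,t1 */
    }

Mul_GF4_N and Mul_GF4_N_GF4 below follow the same pattern, with the scaling by N folded into the XOR ordering.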
+
+sub Mul_GF4_N {                                # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+       veor    $t0, $y0, $y1
+       vand    $t0, $t0, $x0
+       veor    $x0, $x0, $x1
+       vand    $x1, $x1, $y0
+       vand    $x0, $x0, $y1
+       veor    $x1, $x1, $x0
+       veor    $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+       veor    $t0, $y0, $y1
+        veor   $t1, $y2, $y3
+       vand    $t0, $t0, $x0
+        vand   $t1, $t1, $x2
+       veor    $x0, $x0, $x1
+        veor   $x2, $x2, $x3
+       vand    $x1, $x1, $y0
+        vand   $x3, $x3, $y2
+       vand    $x0, $x0, $y1
+        vand   $x2, $x2, $y3
+       veor    $x1, $x1, $x0
+        veor   $x2, $x2, $x3
+       veor    $x0, $x0, $t0
+        veor   $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+       veor    @t[0], @x[0], @x[2]
+       veor    @t[1], @x[1], @x[3]
+___
+       &Mul_GF4        (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+       veor    @y[0], @y[0], @y[2]
+       veor    @y[1], @y[1], @y[3]
+___
+       Mul_GF4_N_GF4   (@t[0], @t[1], @y[0], @y[1], @t[3],
+                        @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+       veor    @x[0], @x[0], @t[0]
+       veor    @x[2], @x[2], @t[0]
+       veor    @x[1], @x[1], @t[1]
+       veor    @x[3], @x[3], @t[1]
+
+       veor    @t[0], @x[4], @x[6]
+       veor    @t[1], @x[5], @x[7]
+___
+       &Mul_GF4_N_GF4  (@t[0], @t[1], @y[0], @y[1], @t[3],
+                        @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+       veor    @y[0], @y[0], @y[2]
+       veor    @y[1], @y[1], @y[3]
+___
+       &Mul_GF4        (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+       veor    @x[4], @x[4], @t[0]
+       veor    @x[6], @x[6], @t[0]
+       veor    @x[5], @x[5], @t[1]
+       veor    @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+       veor    @t[3], @x[4], @x[6]
+       veor    @t[2], @x[5], @x[7]
+       veor    @t[1], @x[1], @x[3]
+       veor    @s[1], @x[7], @x[6]
+        vmov   @t[0], @t[2]
+       veor    @s[0], @x[0], @x[2]
+
+       vorr    @t[2], @t[2], @t[1]
+       veor    @s[3], @t[3], @t[0]
+       vand    @s[2], @t[3], @s[0]
+       vorr    @t[3], @t[3], @s[0]
+       veor    @s[0], @s[0], @t[1]
+       vand    @t[0], @t[0], @t[1]
+       veor    @t[1], @x[3], @x[2]
+       vand    @s[3], @s[3], @s[0]
+       vand    @s[1], @s[1], @t[1]
+       veor    @t[1], @x[4], @x[5]
+       veor    @s[0], @x[1], @x[0]
+       veor    @t[3], @t[3], @s[1]
+       veor    @t[2], @t[2], @s[1]
+       vand    @s[1], @t[1], @s[0]
+       vorr    @t[1], @t[1], @s[0]
+       veor    @t[3], @t[3], @s[3]
+       veor    @t[0], @t[0], @s[1]
+       veor    @t[2], @t[2], @s[2]
+       veor    @t[1], @t[1], @s[3]
+       veor    @t[0], @t[0], @s[2]
+       vand    @s[0], @x[7], @x[3]
+       veor    @t[1], @t[1], @s[2]
+       vand    @s[1], @x[6], @x[2]
+       vand    @s[2], @x[5], @x[1]
+       vorr    @s[3], @x[4], @x[0]
+       veor    @t[3], @t[3], @s[0]
+       veor    @t[1], @t[1], @s[2]
+       veor    @t[0], @t[0], @s[3]
+       veor    @t[2], @t[2], @s[1]
+
+       @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+       @ new smaller inversion
+
+       vand    @s[2], @t[3], @t[1]
+       vmov    @s[0], @t[0]
+
+       veor    @s[1], @t[2], @s[2]
+       veor    @s[3], @t[0], @s[2]
+       veor    @s[2], @t[0], @s[2]     @ @s[2]=@s[3]
+
+       vbsl    @s[1], @t[1], @t[0]
+       vbsl    @s[3], @t[3], @t[2]
+       veor    @t[3], @t[3], @t[2]
+
+       vbsl    @s[0], @s[1], @s[2]
+       vbsl    @t[0], @s[2], @s[1]
+
+       vand    @s[2], @s[0], @s[3]
+       veor    @t[1], @t[1], @t[0]
+
+       veor    @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+       &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+       vldmia  $key!, {@t[0]-@t[3]}
+       veor    @t[0], @t[0], @x[0]
+       veor    @t[1], @t[1], @x[1]
+       vtbl.8  `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[0]}
+       veor    @t[2], @t[2], @x[2]
+       vtbl.8  `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[1]}
+       veor    @t[3], @t[3], @x[3]
+       vtbl.8  `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[2]}
+       vtbl.8  `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+       vldmia  $key!, {@t[3]}
+       veor    @t[0], @t[0], @x[4]
+       veor    @t[1], @t[1], @x[5]
+       vtbl.8  `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+       veor    @t[2], @t[2], @x[6]
+       vtbl.8  `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+       veor    @t[3], @t[3], @x[7]
+       vtbl.8  `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+       vtbl.8  `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+       vtbl.8  `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16];        # optional
+$code.=<<___;
+       vext.8  @t[0], @x[0], @x[0], #12        @ x0 <<< 32
+       vext.8  @t[1], @x[1], @x[1], #12
+        veor   @x[0], @x[0], @t[0]             @ x0 ^ (x0 <<< 32)
+       vext.8  @t[2], @x[2], @x[2], #12
+        veor   @x[1], @x[1], @t[1]
+       vext.8  @t[3], @x[3], @x[3], #12
+        veor   @x[2], @x[2], @t[2]
+       vext.8  @t[4], @x[4], @x[4], #12
+        veor   @x[3], @x[3], @t[3]
+       vext.8  @t[5], @x[5], @x[5], #12
+        veor   @x[4], @x[4], @t[4]
+       vext.8  @t[6], @x[6], @x[6], #12
+        veor   @x[5], @x[5], @t[5]
+       vext.8  @t[7], @x[7], @x[7], #12
+        veor   @x[6], @x[6], @t[6]
+
+       veor    @t[1], @t[1], @x[0]
+        veor   @x[7], @x[7], @t[7]
+        vext.8 @x[0], @x[0], @x[0], #8         @ (x0 ^ (x0 <<< 32)) <<< 64
+       veor    @t[2], @t[2], @x[1]
+       veor    @t[0], @t[0], @x[7]
+       veor    @t[1], @t[1], @x[7]
+        vext.8 @x[1], @x[1], @x[1], #8
+       veor    @t[5], @t[5], @x[4]
+        veor   @x[0], @x[0], @t[0]
+       veor    @t[6], @t[6], @x[5]
+        veor   @x[1], @x[1], @t[1]
+        vext.8 @t[0], @x[4], @x[4], #8
+       veor    @t[4], @t[4], @x[3]
+        vext.8 @t[1], @x[5], @x[5], #8
+       veor    @t[7], @t[7], @x[6]
+        vext.8 @x[4], @x[3], @x[3], #8
+       veor    @t[3], @t[3], @x[2]
+        vext.8 @x[5], @x[7], @x[7], #8
+       veor    @t[4], @t[4], @x[7]
+        vext.8 @x[3], @x[6], @x[6], #8
+       veor    @t[3], @t[3], @x[7]
+        vext.8 @x[6], @x[2], @x[2], #8
+       veor    @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+       veor    @x[2], @t[0], @t[4]
+       veor    @x[4], @x[4], @t[3]
+       veor    @x[5], @x[5], @t[7]
+       veor    @x[3], @x[3], @t[6]
+        @ vmov @x[2], @t[0]
+       veor    @x[6], @x[6], @t[2]
+        @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+       veor    @t[3], @t[3], @x[4]
+       veor    @x[5], @x[5], @t[7]
+       veor    @x[2], @x[3], @t[6]
+       veor    @x[3], @t[0], @t[4]
+       veor    @x[4], @x[6], @t[2]
+       vmov    @x[6], @t[3]
+        @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+       @ multiplication by 0x0e
+       vext.8  @t[7], @x[7], @x[7], #12
+       vmov    @t[2], @x[2]
+       veor    @x[2], @x[2], @x[5]             @ 2 5
+       veor    @x[7], @x[7], @x[5]             @ 7 5
+       vext.8  @t[0], @x[0], @x[0], #12
+       vmov    @t[5], @x[5]
+       veor    @x[5], @x[5], @x[0]             @ 5 0           [1]
+       veor    @x[0], @x[0], @x[1]             @ 0 1
+       vext.8  @t[1], @x[1], @x[1], #12
+       veor    @x[1], @x[1], @x[2]             @ 1 25
+       veor    @x[0], @x[0], @x[6]             @ 01 6          [2]
+       vext.8  @t[3], @x[3], @x[3], #12
+       veor    @x[1], @x[1], @x[3]             @ 125 3         [4]
+       veor    @x[2], @x[2], @x[0]             @ 25 016        [3]
+       veor    @x[3], @x[3], @x[7]             @ 3 75
+       veor    @x[7], @x[7], @x[6]             @ 75 6          [0]
+       vext.8  @t[6], @x[6], @x[6], #12
+       vmov    @t[4], @x[4]
+       veor    @x[6], @x[6], @x[4]             @ 6 4
+       veor    @x[4], @x[4], @x[3]             @ 4 375         [6]
+       veor    @x[3], @x[3], @x[7]             @ 375 756=36
+       veor    @x[6], @x[6], @t[5]             @ 64 5          [7]
+       veor    @x[3], @x[3], @t[2]             @ 36 2
+       vext.8  @t[5], @t[5], @t[5], #12
+       veor    @x[3], @x[3], @t[4]             @ 362 4         [5]
+___
+                                       my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+       @ multiplication by 0x0b
+       veor    @y[1], @y[1], @y[0]
+       veor    @y[0], @y[0], @t[0]
+       vext.8  @t[2], @t[2], @t[2], #12
+       veor    @y[1], @y[1], @t[1]
+       veor    @y[0], @y[0], @t[5]
+       vext.8  @t[4], @t[4], @t[4], #12
+       veor    @y[1], @y[1], @t[6]
+       veor    @y[0], @y[0], @t[7]
+       veor    @t[7], @t[7], @t[6]             @ clobber t[7]
+
+       veor    @y[3], @y[3], @t[0]
+        veor   @y[1], @y[1], @y[0]
+       vext.8  @t[0], @t[0], @t[0], #12
+       veor    @y[2], @y[2], @t[1]
+       veor    @y[4], @y[4], @t[1]
+       vext.8  @t[1], @t[1], @t[1], #12
+       veor    @y[2], @y[2], @t[2]
+       veor    @y[3], @y[3], @t[2]
+       veor    @y[5], @y[5], @t[2]
+       veor    @y[2], @y[2], @t[7]
+       vext.8  @t[2], @t[2], @t[2], #12
+       veor    @y[3], @y[3], @t[3]
+       veor    @y[6], @y[6], @t[3]
+       veor    @y[4], @y[4], @t[3]
+       veor    @y[7], @y[7], @t[4]
+       vext.8  @t[3], @t[3], @t[3], #12
+       veor    @y[5], @y[5], @t[4]
+       veor    @y[7], @y[7], @t[7]
+       veor    @t[7], @t[7], @t[5]             @ clobber t[7] even more
+       veor    @y[3], @y[3], @t[5]
+       veor    @y[4], @y[4], @t[4]
+
+       veor    @y[5], @y[5], @t[7]
+       vext.8  @t[4], @t[4], @t[4], #12
+       veor    @y[6], @y[6], @t[7]
+       veor    @y[4], @y[4], @t[7]
+
+       veor    @t[7], @t[7], @t[5]
+       vext.8  @t[5], @t[5], @t[5], #12
+
+       @ multiplication by 0x0d
+       veor    @y[4], @y[4], @y[7]
+        veor   @t[7], @t[7], @t[6]             @ restore t[7]
+       veor    @y[7], @y[7], @t[4]
+       vext.8  @t[6], @t[6], @t[6], #12
+       veor    @y[2], @y[2], @t[0]
+       veor    @y[7], @y[7], @t[5]
+       vext.8  @t[7], @t[7], @t[7], #12
+       veor    @y[2], @y[2], @t[2]
+
+       veor    @y[3], @y[3], @y[1]
+       veor    @y[1], @y[1], @t[1]
+       veor    @y[0], @y[0], @t[0]
+       veor    @y[3], @y[3], @t[0]
+       veor    @y[1], @y[1], @t[5]
+       veor    @y[0], @y[0], @t[5]
+       vext.8  @t[0], @t[0], @t[0], #12
+       veor    @y[1], @y[1], @t[7]
+       veor    @y[0], @y[0], @t[6]
+       veor    @y[3], @y[3], @y[1]
+       veor    @y[4], @y[4], @t[1]
+       vext.8  @t[1], @t[1], @t[1], #12
+
+       veor    @y[7], @y[7], @t[7]
+       veor    @y[4], @y[4], @t[2]
+       veor    @y[5], @y[5], @t[2]
+       veor    @y[2], @y[2], @t[6]
+       veor    @t[6], @t[6], @t[3]             @ clobber t[6]
+       vext.8  @t[2], @t[2], @t[2], #12
+       veor    @y[4], @y[4], @y[7]
+       veor    @y[3], @y[3], @t[6]
+
+       veor    @y[6], @y[6], @t[6]
+       veor    @y[5], @y[5], @t[5]
+       vext.8  @t[5], @t[5], @t[5], #12
+       veor    @y[6], @y[6], @t[4]
+       vext.8  @t[4], @t[4], @t[4], #12
+       veor    @y[5], @y[5], @t[6]
+       veor    @y[6], @y[6], @t[7]
+       vext.8  @t[7], @t[7], @t[7], #12
+       veor    @t[6], @t[6], @t[3]             @ restore t[6]
+       vext.8  @t[3], @t[3], @t[3], #12
+
+       @ multiplication by 0x09
+       veor    @y[4], @y[4], @y[1]
+       veor    @t[1], @t[1], @y[1]             @ t[1]=y[1]
+       veor    @t[0], @t[0], @t[5]             @ clobber t[0]
+       vext.8  @t[6], @t[6], @t[6], #12
+       veor    @t[1], @t[1], @t[5]
+       veor    @y[3], @y[3], @t[0]
+       veor    @t[0], @t[0], @y[0]             @ t[0]=y[0]
+       veor    @t[1], @t[1], @t[6]
+       veor    @t[6], @t[6], @t[7]             @ clobber t[6]
+       veor    @y[4], @y[4], @t[1]
+       veor    @y[7], @y[7], @t[4]
+       veor    @y[6], @y[6], @t[3]
+       veor    @y[5], @y[5], @t[2]
+       veor    @t[4], @t[4], @y[4]             @ t[4]=y[4]
+       veor    @t[3], @t[3], @y[3]             @ t[3]=y[3]
+       veor    @t[5], @t[5], @y[5]             @ t[5]=y[5]
+       veor    @t[2], @t[2], @y[2]             @ t[2]=y[2]
+       veor    @t[3], @t[3], @t[7]
+       veor    @XMM[5], @t[5], @t[6]
+       veor    @XMM[6], @t[6], @y[6]           @ t[6]=y[6]
+       veor    @XMM[2], @t[2], @t[6]
+       veor    @XMM[7], @t[7], @y[7]           @ t[7]=y[7]
+
+       vmov    @XMM[0], @t[0]
+       vmov    @XMM[1], @t[1]
+       @ vmov  @XMM[2], @t[2]
+       vmov    @XMM[3], @t[3]
+       vmov    @XMM[4], @t[4]
+       @ vmov  @XMM[5], @t[5]
+       @ vmov  @XMM[6], @t[6]
+       @ vmov  @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
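+#
+# Illustrative sketch only (never called by this generator; assumes the AES
+# reduction polynomial 0x11b): one way to spot-check an entry of the
+# factorisation above, e.g. the top-left one,
+# 0e = 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0a ^ 04 in GF(2^8).
+sub gf256_mul_demo {
+my ($x,$y)=@_;
+my $r=0;
+for (1..8) {
+	$r ^= $x if ($y & 1);
+	$y >>= 1;
+	$x <<= 1;
+	$x ^= 0x11b if ($x & 0x100);
+}
+return $r;	# gf256_mul_demo(0x02,0x05)^gf256_mul_demo(0x01,0x04) == 0x0e
+}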
+
+$code.=<<___;
+       @ multiplication by 0x05-0x00-0x04-0x00
+       vext.8  @t[0], @x[0], @x[0], #8
+       vext.8  @t[6], @x[6], @x[6], #8
+       vext.8  @t[7], @x[7], @x[7], #8
+       veor    @t[0], @t[0], @x[0]
+       vext.8  @t[1], @x[1], @x[1], #8
+       veor    @t[6], @t[6], @x[6]
+       vext.8  @t[2], @x[2], @x[2], #8
+       veor    @t[7], @t[7], @x[7]
+       vext.8  @t[3], @x[3], @x[3], #8
+       veor    @t[1], @t[1], @x[1]
+       vext.8  @t[4], @x[4], @x[4], #8
+       veor    @t[2], @t[2], @x[2]
+       vext.8  @t[5], @x[5], @x[5], #8
+       veor    @t[3], @t[3], @x[3]
+       veor    @t[4], @t[4], @x[4]
+       veor    @t[5], @t[5], @x[5]
+
+        veor   @x[0], @x[0], @t[6]
+        veor   @x[1], @x[1], @t[6]
+        veor   @x[2], @x[2], @t[0]
+        veor   @x[4], @x[4], @t[2]
+        veor   @x[3], @x[3], @t[1]
+        veor   @x[1], @x[1], @t[7]
+        veor   @x[2], @x[2], @t[7]
+        veor   @x[4], @x[4], @t[6]
+        veor   @x[5], @x[5], @t[3]
+        veor   @x[3], @x[3], @t[6]
+        veor   @x[6], @x[6], @t[4]
+        veor   @x[4], @x[4], @t[7]
+        veor   @x[5], @x[5], @t[7]
+        veor   @x[7], @x[7], @t[5]
+___
+       &MixColumns     (@x,@t,1);      # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+       vshr.u64        $t, $b, #$n
+       veor            $t, $t, $a
+       vand            $t, $t, $mask
+       veor            $a, $a, $t
+       vshl.u64        $t, $t, #$n
+       veor            $b, $b, $t
+___
+}
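+
+# Scalar restatement of the sequence above (a sketch for illustration only,
+# never called by the generator): swapmove exchanges, between two words, the
+# bit groups selected by the mask at distance n; bitslice() below chains
+# such swaps to transpose the state into bit-sliced form.
+sub swapmove_scalar_demo {
+my ($a,$b,$n,$mask)=@_;
+my $t=(($b>>$n)^$a)&$mask;		# vshr.u64 + veor + vand
+return ($a^$t, $b^($t<<$n));		# veor / vshl.u64 + veor
+}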
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+       vshr.u64        $t0, $b0, #$n
+        vshr.u64       $t1, $b1, #$n
+       veor            $t0, $t0, $a0
+        veor           $t1, $t1, $a1
+       vand            $t0, $t0, $mask
+        vand           $t1, $t1, $mask
+       veor            $a0, $a0, $t0
+       vshl.u64        $t0, $t0, #$n
+        veor           $a1, $a1, $t1
+        vshl.u64       $t1, $t1, #$n
+       veor            $b0, $b0, $t0
+        veor           $b1, $b1, $t1
+___
+}
+
+sub bitslice {
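+# The swapmove2x passes below (masks 0x55/0x33/0x0f, shifts 1/2/4) amount to
+# an 8x8 bit-matrix transpose across the eight input registers, so that each
+# output register collects one bit position from every byte of the state.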
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+       vmov.i8 $t0,#0x55                       @ compose .LBS0
+       vmov.i8 $t1,#0x33                       @ compose .LBS1
+___
+       &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+       &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+       vmov.i8 $t0,#0x0f                       @ compose .LBS2
+___
+       &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+       &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+       &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+       &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
+# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax        unified         @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu   neon
+
+.type  _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+       adr     $const,_bsaes_decrypt8
+       vldmia  $key!, {@XMM[9]}                @ round 0 key
+       add     $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+       vldmia  $const!, {@XMM[8]}              @ .LM0ISR
+       veor    @XMM[10], @XMM[0], @XMM[9]      @ xor with round0 key
+       veor    @XMM[11], @XMM[1], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+       veor    @XMM[12], @XMM[2], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+       veor    @XMM[13], @XMM[3], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+       veor    @XMM[14], @XMM[4], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+       veor    @XMM[15], @XMM[5], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+       veor    @XMM[10], @XMM[6], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+       veor    @XMM[11], @XMM[7], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+        vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+       &bitslice       (@XMM[0..7, 8..11]);
+$code.=<<___;
+       sub     $rounds,$rounds,#1
+       b       .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+       &ShiftRows      (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+       &InvSbox        (@XMM[0..7, 8..15]);
+$code.=<<___;
+       subs    $rounds,$rounds,#1
+       bcc     .Ldec_done
+___
+       &InvMixColumns  (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+       vldmia  $const, {@XMM[12]}              @ .LISR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   $const,$const,#0x10
+       bne     .Ldec_loop
+       vldmia  $const, {@XMM[12]}              @ .LISRM0
+       b       .Ldec_loop
+.align 4
+.Ldec_done:
+___
+       &bitslice       (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+       vldmia  $key, {@XMM[8]}                 @ last round key
+       veor    @XMM[6], @XMM[6], @XMM[8]
+       veor    @XMM[4], @XMM[4], @XMM[8]
+       veor    @XMM[2], @XMM[2], @XMM[8]
+       veor    @XMM[7], @XMM[7], @XMM[8]
+       veor    @XMM[3], @XMM[3], @XMM[8]
+       veor    @XMM[5], @XMM[5], @XMM[8]
+       veor    @XMM[0], @XMM[0], @XMM[8]
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       bx      lr
+.size  _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type  _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR:       @ InvShiftRows constants
+       .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+       .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+       .quad   0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:                @ ShiftRows constants
+       .quad   0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+       .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+       .quad   0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+       .quad   0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+       .quad   0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size  _bsaes_const,.-_bsaes_const
+
+.type  _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+       adr     $const,_bsaes_encrypt8
+       vldmia  $key!, {@XMM[9]}                @ round 0 key
+       sub     $const,$const,#_bsaes_encrypt8-.LM0SR
+
+       vldmia  $const!, {@XMM[8]}              @ .LM0SR
+_bsaes_encrypt8_alt:
+       veor    @XMM[10], @XMM[0], @XMM[9]      @ xor with round0 key
+       veor    @XMM[11], @XMM[1], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+       veor    @XMM[12], @XMM[2], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+       veor    @XMM[13], @XMM[3], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+       veor    @XMM[14], @XMM[4], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+       veor    @XMM[15], @XMM[5], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+       veor    @XMM[10], @XMM[6], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+       veor    @XMM[11], @XMM[7], @XMM[9]
+        vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+        vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+        vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+       &bitslice       (@XMM[0..7, 8..11]);
+$code.=<<___;
+       sub     $rounds,$rounds,#1
+       b       .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+       &ShiftRows      (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+       &Sbox           (@XMM[0..7, 8..15]);
+$code.=<<___;
+       subs    $rounds,$rounds,#1
+       bcc     .Lenc_done
+___
+       &MixColumns     (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+       vldmia  $const, {@XMM[12]}              @ .LSR
+       ite     eq                              @ Thumb2 thing, sanity check in ARM
+       addeq   $const,$const,#0x10
+       bne     .Lenc_loop
+       vldmia  $const, {@XMM[12]}              @ .LSRM0
+       b       .Lenc_loop
+.align 4
+.Lenc_done:
+___
+       # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+       &bitslice       (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+       vldmia  $key, {@XMM[8]}                 @ last round key
+       veor    @XMM[4], @XMM[4], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[8]
+       veor    @XMM[3], @XMM[3], @XMM[8]
+       veor    @XMM[7], @XMM[7], @XMM[8]
+       veor    @XMM[2], @XMM[2], @XMM[8]
+       veor    @XMM[5], @XMM[5], @XMM[8]
+       veor    @XMM[0], @XMM[0], @XMM[8]
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       bx      lr
+.size  _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+       &swapmove       (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+       @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+       vmov    @x[2], @x[0]
+       vmov    @x[3], @x[1]
+___
+       #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+       &swapmove2x     (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+       @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+       vmov    @x[4], @x[0]
+       vmov    @x[6], @x[2]
+       vmov    @x[5], @x[1]
+       vmov    @x[7], @x[3]
+___
+       &swapmove2x     (@x[0,4,1,5],4,$bs2,$t2,$t3);
+       &swapmove2x     (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type  _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+       adr     $const,_bsaes_key_convert
+       vld1.8  {@XMM[7]},  [$inp]!             @ load round 0 key
+       sub     $const,$const,#_bsaes_key_convert-.LM0
+       vld1.8  {@XMM[15]}, [$inp]!             @ load round 1 key
+
+       vmov.i8 @XMM[8],  #0x01                 @ bit masks
+       vmov.i8 @XMM[9],  #0x02
+       vmov.i8 @XMM[10], #0x04
+       vmov.i8 @XMM[11], #0x08
+       vmov.i8 @XMM[12], #0x10
+       vmov.i8 @XMM[13], #0x20
+       vldmia  $const, {@XMM[14]}              @ .LM0
+
+#ifdef __ARMEL__
+       vrev32.8        @XMM[7],  @XMM[7]
+       vrev32.8        @XMM[15], @XMM[15]
+#endif
+       sub     $rounds,$rounds,#1
+       vstmia  $out!, {@XMM[7]}                @ save round 0 key
+       b       .Lkey_loop
+
+.align 4
+.Lkey_loop:
+       vtbl.8  `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+       vtbl.8  `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+       vmov.i8 @XMM[6],  #0x40
+       vmov.i8 @XMM[15], #0x80
+
+       vtst.8  @XMM[0], @XMM[7], @XMM[8]
+       vtst.8  @XMM[1], @XMM[7], @XMM[9]
+       vtst.8  @XMM[2], @XMM[7], @XMM[10]
+       vtst.8  @XMM[3], @XMM[7], @XMM[11]
+       vtst.8  @XMM[4], @XMM[7], @XMM[12]
+       vtst.8  @XMM[5], @XMM[7], @XMM[13]
+       vtst.8  @XMM[6], @XMM[7], @XMM[6]
+       vtst.8  @XMM[7], @XMM[7], @XMM[15]
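+       @ each vtst.8 above expands one bit of every key byte into a full
+       @ byte of ones or zeroes; the four vmvn below invert bit positions
+       @ 0, 1, 5 and 6, i.e. the set bits of 0x63 (cf. .L63 below), which
+       @ folds the S-box affine constant into the bit-sliced round key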
+       vld1.8  {@XMM[15]}, [$inp]!             @ load next round key
+       vmvn    @XMM[0], @XMM[0]                @ "pnot"
+       vmvn    @XMM[1], @XMM[1]
+       vmvn    @XMM[5], @XMM[5]
+       vmvn    @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+       vrev32.8        @XMM[15], @XMM[15]
+#endif
+       subs    $rounds,$rounds,#1
+       vstmia  $out!,{@XMM[0]-@XMM[7]}         @ write bit-sliced round key
+       bne     .Lkey_loop
+
+       vmov.i8 @XMM[7],#0x63                   @ compose .L63
+       @ don't save last round key
+       bx      lr
+.size  _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) {               # following four functions are unsupported interface
+                       # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type  bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+
+       ldr     r5,[$inp,#240]                  @ pass rounds
+       mov     r4,$inp                         @ pass key
+       mov     r12,$out                        @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type  bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+.Lenc128_loop:
+       vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
+       vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
+       mov     r4,$key                         @ pass the key
+       vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
+       mov     r5,#10                          @ pass rounds
+       vld1.8  {@XMM[6]-@XMM[7]}, [$inp]!
+
+       bl      _bsaes_encrypt8
+
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       subs    $len,$len,#0x80
+       vst1.8  {@XMM[5]}, [$out]!
+       bhi     .Lenc128_loop
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type  bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+
+       ldr     r5,[$inp,#240]                  @ pass rounds
+       mov     r4,$inp                         @ pass key
+       mov     r12,$out                        @ pass key schedule
+       bl      _bsaes_key_convert
+       vldmia  $out, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  $out, {@XMM[7]}
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type  bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+       stmdb   sp!,{r4-r6,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+.Ldec128_loop:
+       vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
+       vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
+       mov     r4,$key                         @ pass the key
+       vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
+       mov     r5,#10                          @ pass rounds
+       vld1.8  {@XMM[6]-@XMM[7]}, [$inp]!
+
+       bl      _bsaes_decrypt8
+
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       subs    $len,$len,#0x80
+       vst1.8  {@XMM[5]}, [$out]!
+       bhi     .Ldec128_loop
+
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r6,pc}
+.size  bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global        bsaes_cbc_encrypt
+.type  bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef        __KERNEL__
+       cmp     $len, #128
+#ifndef        __thumb__
+       blo     AES_cbc_encrypt
+#else
+       bhs     1f
+       b       AES_cbc_encrypt
+1:
+#endif
+#endif
+
+       @ it is up to the caller to make sure we are called with enc == 0
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     $ivp, [ip]                      @ IV is 1st arg on the stack
+       mov     $len, $len, lsr#4               @ len in 16 byte blocks
+       sub     sp, #0x10                       @ scratch space to carry over the IV
+       mov     $fp, sp                         @ save sp
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       add     r12, #`128-32`                  @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12                         @ sp is $keysched
+       bl      _bsaes_key_convert
+       vldmia  $keysched, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  $keysched, {@XMM[7]}
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, $key, #248
+       vldmia  r4, {@XMM[6]}
+       vstmia  r12, {@XMM[15]}                 @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+       vld1.8  {@XMM[15]}, [$ivp]              @ load IV
+       b       .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+       subs    $len, $len, #0x8
+       bmi     .Lcbc_dec_loop_finish
+
+       vld1.8  {@XMM[0]-@XMM[1]}, [$inp]!      @ load input
+       vld1.8  {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, $keysched                   @ pass the key
+#else
+       add     r4, $key, #248
+#endif
+       vld1.8  {@XMM[4]-@XMM[5]}, [$inp]!
+       mov     r5, $rounds
+       vld1.8  {@XMM[6]-@XMM[7]}, [$inp]
+       sub     $inp, $inp, #0x60
+       vstmia  $fp, {@XMM[15]}                 @ put aside IV
+
+       bl      _bsaes_decrypt8
+
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[12]-@XMM[13]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vld1.8  {@XMM[14]-@XMM[15]}, [$inp]!
+       veor    @XMM[7], @XMM[7], @XMM[12]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor    @XMM[3], @XMM[3], @XMM[13]
+       vst1.8  {@XMM[6]}, [$out]!
+       veor    @XMM[5], @XMM[5], @XMM[14]
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       vst1.8  {@XMM[5]}, [$out]!
+
+       b       .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+       adds    $len, $len, #8
+       beq     .Lcbc_dec_done
+
+       vld1.8  {@XMM[0]}, [$inp]!              @ load input
+       cmp     $len, #2
+       blo     .Lcbc_dec_one
+       vld1.8  {@XMM[1]}, [$inp]!
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       mov     r4, $keysched                   @ pass the key
+#else
+       add     r4, $key, #248
+#endif
+       mov     r5, $rounds
+       vstmia  $fp, {@XMM[15]}                 @ put aside IV
+       beq     .Lcbc_dec_two
+       vld1.8  {@XMM[2]}, [$inp]!
+       cmp     $len, #4
+       blo     .Lcbc_dec_three
+       vld1.8  {@XMM[3]}, [$inp]!
+       beq     .Lcbc_dec_four
+       vld1.8  {@XMM[4]}, [$inp]!
+       cmp     $len, #6
+       blo     .Lcbc_dec_five
+       vld1.8  {@XMM[5]}, [$inp]!
+       beq     .Lcbc_dec_six
+       vld1.8  {@XMM[6]}, [$inp]!
+       sub     $inp, $inp, #0x70
+
+       bl      _bsaes_decrypt8
+
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[12]-@XMM[13]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[7], @XMM[7], @XMM[12]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor    @XMM[3], @XMM[3], @XMM[13]
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       vst1.8  {@XMM[3]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+       sub     $inp, $inp, #0x60
+       bl      _bsaes_decrypt8
+       vldmia  $fp,{@XMM[14]}                  @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[12]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[7], @XMM[7], @XMM[12]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       vst1.8  {@XMM[7]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+       sub     $inp, $inp, #0x50
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]-@XMM[11]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor    @XMM[2], @XMM[2], @XMM[11]
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       vst1.8  {@XMM[2]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+       sub     $inp, $inp, #0x40
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[10]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[4], @XMM[4], @XMM[10]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       vst1.8  {@XMM[4]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+       sub     $inp, $inp, #0x30
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]-@XMM[9]}, [$inp]!      @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[15]}, [$inp]!
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       veor    @XMM[6], @XMM[6], @XMM[9]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       vst1.8  {@XMM[6]}, [$out]!
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+       sub     $inp, $inp, #0x20
+       bl      _bsaes_decrypt8
+       vldmia  $fp, {@XMM[14]}                 @ reload IV
+       vld1.8  {@XMM[8]}, [$inp]!              @ reload input
+       veor    @XMM[0], @XMM[0], @XMM[14]      @ ^= IV
+       vld1.8  {@XMM[15]}, [$inp]!             @ reload input
+       veor    @XMM[1], @XMM[1], @XMM[8]
+       vst1.8  {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       b       .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+       sub     $inp, $inp, #0x10
+       mov     $rounds, $out                   @ save original out pointer
+       mov     $out, $fp                       @ use the iv scratch space as out buffer
+       mov     r2, $key
+       vmov    @XMM[4],@XMM[15]                @ just in case ensure that IV
+       vmov    @XMM[5],@XMM[0]                 @ and input are preserved
+       bl      AES_decrypt
+       vld1.8  {@XMM[0]}, [$fp,:64]            @ load result
+       veor    @XMM[0], @XMM[0], @XMM[4]       @ ^= IV
+       vmov    @XMM[15], @XMM[5]               @ @XMM[5] holds input
+       vst1.8  {@XMM[0]}, [$rounds]            @ write output
+
+.Lcbc_dec_done:
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+.Lcbc_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          $keysched!, {q0-q1}
+       cmp             $keysched, $fp
+       bne             .Lcbc_dec_bzero
+#endif
+
+       mov     sp, $fp
+       add     sp, #0x10                       @ add sp,$fp,#0x10 is no good for thumb
+       vst1.8  {@XMM[15]}, [$ivp]              @ return IV
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}
+.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6";      # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern        AES_encrypt
+.global        bsaes_ctr32_encrypt_blocks
+.type  bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+       cmp     $len, #8                        @ use plain AES for
+       blo     .Lctr_enc_short                 @ small sizes
+
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}
+       VFP_ABI_PUSH
+       ldr     $ctr, [ip]                      @ ctr is 1st arg on the stack
+       sub     sp, sp, #0x10                   @ scratch space to carry over the ctr
+       mov     $fp, sp                         @ save sp
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       add     r12, #`128-32`                  @ size of bit-sliced key schedule
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12                         @ sp is $keysched
+       bl      _bsaes_key_convert
+       veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+
+       vld1.8  {@XMM[0]}, [$ctr]               @ load counter
+       add     $ctr, $const, #.LREVM0SR-.LM0   @ borrow $ctr
+       vldmia  $keysched, {@XMM[4]}            @ load round0 key
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       @ populate the key schedule
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7],@XMM[7],@XMM[15]        @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+
+.align 2
+0:     add     r12, $key, #248
+       vld1.8  {@XMM[0]}, [$ctr]               @ load counter
+       adrl    $ctr, .LREVM0SR                 @ borrow $ctr
+       vldmia  r12, {@XMM[4]}                  @ load round0 key
+       sub     sp, #0x10                       @ place for adjusted round0 key
+#endif
+
+       vmov.i32        @XMM[8],#1              @ compose 1<<96
+       veor            @XMM[9],@XMM[9],@XMM[9]
+       vrev32.8        @XMM[0],@XMM[0]
+       vext.8          @XMM[8],@XMM[9],@XMM[8],#4
+       vrev32.8        @XMM[4],@XMM[4]
+       vadd.u32        @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+       vstmia  $keysched, {@XMM[4]}            @ save adjusted round0 key
+       b       .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+       vadd.u32        @XMM[10], @XMM[8], @XMM[9]      @ compose 3<<96
+       vadd.u32        @XMM[1], @XMM[0], @XMM[8]       @ +1
+       vadd.u32        @XMM[2], @XMM[0], @XMM[9]       @ +2
+       vadd.u32        @XMM[3], @XMM[0], @XMM[10]      @ +3
+       vadd.u32        @XMM[4], @XMM[1], @XMM[10]
+       vadd.u32        @XMM[5], @XMM[2], @XMM[10]
+       vadd.u32        @XMM[6], @XMM[3], @XMM[10]
+       vadd.u32        @XMM[7], @XMM[4], @XMM[10]
+       vadd.u32        @XMM[10], @XMM[5], @XMM[10]     @ next counter
+
+       @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+       @ to flip byte order in 32-bit counter
+
+       vldmia          $keysched, {@XMM[9]}            @ load round0 key
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, $keysched, #0x10            @ pass next round key
+#else
+       add             r4, $key, #`248+16`
+#endif
+       vldmia          $ctr, {@XMM[8]}                 @ .LREVM0SR
+       mov             r5, $rounds                     @ pass rounds
+       vstmia          $fp, {@XMM[10]}                 @ save next counter
+       sub             $const, $ctr, #.LREVM0SR-.LSR   @ pass constants
+
+       bl              _bsaes_encrypt8_alt
+
+       subs            $len, $len, #8
+       blo             .Lctr_enc_loop_done
+
+       vld1.8          {@XMM[8]-@XMM[9]}, [$inp]!      @ load input
+       vld1.8          {@XMM[10]-@XMM[11]}, [$inp]!
+       veor            @XMM[0], @XMM[8]
+       veor            @XMM[1], @XMM[9]
+       vld1.8          {@XMM[12]-@XMM[13]}, [$inp]!
+       veor            @XMM[4], @XMM[10]
+       veor            @XMM[6], @XMM[11]
+       vld1.8          {@XMM[14]-@XMM[15]}, [$inp]!
+       veor            @XMM[3], @XMM[12]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!      @ write output
+       veor            @XMM[7], @XMM[13]
+       veor            @XMM[2], @XMM[14]
+       vst1.8          {@XMM[4]}, [$out]!
+       veor            @XMM[5], @XMM[15]
+       vst1.8          {@XMM[6]}, [$out]!
+       vmov.i32        @XMM[8], #1                     @ compose 1<<96
+       vst1.8          {@XMM[3]}, [$out]!
+       veor            @XMM[9], @XMM[9], @XMM[9]
+       vst1.8          {@XMM[7]}, [$out]!
+       vext.8          @XMM[8], @XMM[9], @XMM[8], #4
+       vst1.8          {@XMM[2]}, [$out]!
+       vadd.u32        @XMM[9],@XMM[8],@XMM[8]         @ compose 2<<96
+       vst1.8          {@XMM[5]}, [$out]!
+       vldmia          $fp, {@XMM[0]}                  @ load counter
+
+       bne             .Lctr_enc_loop
+       b               .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+       add             $len, $len, #8
+       vld1.8          {@XMM[8]}, [$inp]!      @ load input
+       veor            @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [$out]!      @ write output
+       cmp             $len, #2
+       blo             .Lctr_enc_done
+       vld1.8          {@XMM[9]}, [$inp]!
+       veor            @XMM[1], @XMM[9]
+       vst1.8          {@XMM[1]}, [$out]!
+       beq             .Lctr_enc_done
+       vld1.8          {@XMM[10]}, [$inp]!
+       veor            @XMM[4], @XMM[10]
+       vst1.8          {@XMM[4]}, [$out]!
+       cmp             $len, #4
+       blo             .Lctr_enc_done
+       vld1.8          {@XMM[11]}, [$inp]!
+       veor            @XMM[6], @XMM[11]
+       vst1.8          {@XMM[6]}, [$out]!
+       beq             .Lctr_enc_done
+       vld1.8          {@XMM[12]}, [$inp]!
+       veor            @XMM[3], @XMM[12]
+       vst1.8          {@XMM[3]}, [$out]!
+       cmp             $len, #6
+       blo             .Lctr_enc_done
+       vld1.8          {@XMM[13]}, [$inp]!
+       veor            @XMM[7], @XMM[13]
+       vst1.8          {@XMM[7]}, [$out]!
+       beq             .Lctr_enc_done
+       vld1.8          {@XMM[14]}, [$inp]
+       veor            @XMM[2], @XMM[14]
+       vst1.8          {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifndef        BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:                       @ wipe key schedule [if any]
+       vstmia          $keysched!, {q0-q1}
+       cmp             $keysched, $fp
+       bne             .Lctr_enc_bzero
+#else
+       vstmia          $keysched, {q0-q1}
+#endif
+
+       mov     sp, $fp
+       add     sp, #0x10               @ add sp,$fp,#0x10 is no good for thumb
+       VFP_ABI_POP
+       ldmia   sp!, {r4-r10, pc}       @ return
+
+.align 4
+.Lctr_enc_short:
+       ldr     ip, [sp]                @ ctr pointer is passed on stack
+       stmdb   sp!, {r4-r8, lr}
+
+       mov     r4, $inp                @ copy arguments
+       mov     r5, $out
+       mov     r6, $len
+       mov     r7, $key
+       ldr     r8, [ip, #12]           @ load counter LSW
+       vld1.8  {@XMM[1]}, [ip]         @ load whole counter value
+#ifdef __ARMEL__
+       rev     r8, r8
+#endif
+       sub     sp, sp, #0x10
+       vst1.8  {@XMM[1]}, [sp,:64]     @ copy counter value
+       sub     sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+       add     r0, sp, #0x10           @ input counter value
+       mov     r1, sp                  @ output on the stack
+       mov     r2, r7                  @ key
+
+       bl      AES_encrypt
+
+       vld1.8  {@XMM[0]}, [r4]!        @ load input
+       vld1.8  {@XMM[1]}, [sp,:64]     @ load encrypted counter
+       add     r8, r8, #1
+#ifdef __ARMEL__
+       rev     r0, r8
+       str     r0, [sp, #0x1c]         @ next counter value
+#else
+       str     r8, [sp, #0x1c]         @ next counter value
+#endif
+       veor    @XMM[0],@XMM[0],@XMM[1]
+       vst1.8  {@XMM[0]}, [r5]!        @ store output
+       subs    r6, r6, #1
+       bne     .Lctr_enc_short_loop
+
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+       vstmia          sp!, {q0-q1}
+
+       ldmia   sp!, {r4-r8, pc}
+.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#      const AES_KEY *key1, const AES_KEY *key2,
+#      const unsigned char iv[16]);
+#
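+# key1 is the data key and is converted to the bit-sliced schedule below;
+# without XTS_CHAIN_TWEAK the initial tweak is generated here by encrypting
+# iv[] with key2 via AES_encrypt, otherwise the caller passes the tweak in
+# directly.  len is a byte count, and a trailing partial block is handled
+# (in the non-chaining case) by the ciphertext stealing code at
+# .Lxts_enc_steal.
+#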
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6";                # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type  bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future $fp
+
+       mov     $inp, r0
+       mov     $out, r1
+       mov     $len, r2
+       mov     $key, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0,sp                           @ pointer to initial tweak
+#endif
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+       mov     $fp, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       @ add   r12, #`128-32`                  @ size of bit-sliced key schedule
+       sub     r12, #`32+16`                   @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7], @XMM[7], @XMM[15]      @ fix up last round key
+       vstmia  r12, {@XMM[7]}                  @ save last round key
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       veor    @XMM[7], @XMM[7], @XMM[15]      @ fix up last round key
+       vstmia  r12, {@XMM[7]}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+
+       vld1.8  {@XMM[8]}, [r0]                 @ initial tweak
+       adr     $magic, .Lxts_magic
+
+       subs    $len, #0x80
+       blo     .Lxts_enc_short
+       b       .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       vadd.u64        @XMM[8], @XMM[15], @XMM[15]
+       vst1.64         {@XMM[15]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       veor            @XMM[8], @XMM[8], @T[0]
+       vst1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       vld1.8          {@XMM[6]-@XMM[7]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       veor            @XMM[7], @XMM[7], @XMM[15]
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       vld1.64         {@XMM[14]-@XMM[15]}, [r0,:128]!
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[2], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       veor            @XMM[13], @XMM[5], @XMM[15]
+       vst1.8          {@XMM[12]-@XMM[13]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       subs            $len, #0x80
+       bpl             .Lxts_enc_loop
+
+.Lxts_enc_short:
+       adds            $len, #0x70
+       bmi             .Lxts_enc_done
+
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+       subs            $len, #0x10
+       bmi             .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       sub             $len, #0x10
+       vst1.64         {@XMM[15]}, [r0,:128]           @ next round tweak
+
+       vld1.8          {@XMM[6]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       vld1.64         {@XMM[14]}, [r0,:128]!
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[2], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       vst1.8          {@XMM[12]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+       vst1.64         {@XMM[14]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[4], @XMM[4], @XMM[12]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[5], @XMM[5], @XMM[13]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+       .quad   1, 0x87
+
+.align 5
+.Lxts_enc_5:
+       vst1.64         {@XMM[13]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[3], @XMM[3], @XMM[11]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[4], @XMM[4], @XMM[12]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       veor            @XMM[10], @XMM[3], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       vst1.8          {@XMM[10]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+       vst1.64         {@XMM[12]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[2], @XMM[2], @XMM[10]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[3], @XMM[3], @XMM[11]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[6], @XMM[11]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+       vst1.64         {@XMM[11]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[1], @XMM[1], @XMM[9]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[2], @XMM[2], @XMM[10]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       vld1.64         {@XMM[10]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[4], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       vst1.8          {@XMM[8]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+       vst1.64         {@XMM[10]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[0], @XMM[0], @XMM[8]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[1], @XMM[1], @XMM[9]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_encrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+       mov             r0, sp
+       veor            @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                         @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [$out]!
+       mov             $fp, r4
+
+       vmov            @XMM[8], @XMM[9]                @ next round tweak
+
+.Lxts_enc_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            $len, #0x10
+       beq             .Lxts_enc_ret
+       sub             r6, $out, #0x10
+
+.Lxts_enc_steal:
+       ldrb            r0, [$inp], #1
+       ldrb            r1, [$out, #-0x10]
+       strb            r0, [$out, #-0x10]
+       strb            r1, [$out], #1
+
+       subs            $len, #1
+       bhi             .Lxts_enc_steal
+
+       vld1.8          {@XMM[0]}, [r6]
+       mov             r0, sp
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                 @ preserve fp
+
+       bl              AES_encrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [r6]
+       mov             $fp, r4
+#endif
+
+.Lxts_enc_ret:
+       bic             r0, $fp, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [$fp, #0x20+VFP_ABI_FRAME]  @ chain tweak
+#endif
+.Lxts_enc_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_enc_bzero
+
+       mov             sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {@XMM[8]}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type  bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+       mov     ip, sp
+       stmdb   sp!, {r4-r10, lr}               @ 0x20
+       VFP_ABI_PUSH
+       mov     r6, sp                          @ future $fp
+
+       mov     $inp, r0
+       mov     $out, r1
+       mov     $len, r2
+       mov     $key, r3
+
+       sub     r0, sp, #0x10                   @ 0x10
+       bic     r0, #0xf                        @ align at 16 bytes
+       mov     sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+       ldr     r0, [ip]                        @ pointer to input tweak
+#else
+       @ generate initial tweak
+       ldr     r0, [ip, #4]                    @ iv[]
+       mov     r1, sp
+       ldr     r2, [ip, #0]                    @ key2
+       bl      AES_encrypt
+       mov     r0, sp                          @ pointer to initial tweak
+#endif
+
+       ldr     $rounds, [$key, #240]           @ get # of rounds
+       mov     $fp, r6
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       @ allocate the key schedule on the stack
+       sub     r12, sp, $rounds, lsl#7         @ 128 bytes per inner round key
+       @ add   r12, #`128-32`                  @ size of bit-sliced key schedule
+       sub     r12, #`32+16`                   @ place for tweak[9]
+
+       @ populate the key schedule
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       mov     sp, r12
+       add     r12, #0x90                      @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, sp, #0x90
+       vldmia  r4, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  r4, {@XMM[7]}
+#else
+       ldr     r12, [$key, #244]
+       eors    r12, #1
+       beq     0f
+
+       str     r12, [$key, #244]
+       mov     r4, $key                        @ pass key
+       mov     r5, $rounds                     @ pass # of rounds
+       add     r12, $key, #248                 @ pass key schedule
+       bl      _bsaes_key_convert
+       add     r4, $key, #248
+       vldmia  r4, {@XMM[6]}
+       vstmia  r12,  {@XMM[15]}                @ save last round key
+       veor    @XMM[7], @XMM[7], @XMM[6]       @ fix up round 0 key
+       vstmia  r4, {@XMM[7]}
+
+.align 2
+0:     sub     sp, #0x90                       @ place for tweak[9]
+#endif
+       vld1.8  {@XMM[8]}, [r0]                 @ initial tweak
+       adr     $magic, .Lxts_magic
+
+       tst     $len, #0xf                      @ if not multiple of 16
+       it      ne                              @ Thumb2 thing, sanity check in ARM
+       subne   $len, #0x10                     @ subtract another 16 bytes
+       subs    $len, #0x80
+
+       blo     .Lxts_dec_short
+       b       .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       vadd.u64        @XMM[8], @XMM[15], @XMM[15]
+       vst1.64         {@XMM[15]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       veor            @XMM[8], @XMM[8], @T[0]
+       vst1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       vld1.8          {@XMM[6]-@XMM[7]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       veor            @XMM[7], @XMM[7], @XMM[15]
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       vld1.64         {@XMM[14]-@XMM[15]}, [r0,:128]!
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[3], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       veor            @XMM[13], @XMM[5], @XMM[15]
+       vst1.8          {@XMM[12]-@XMM[13]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+
+       subs            $len, #0x80
+       bpl             .Lxts_dec_loop
+
+.Lxts_dec_short:
+       adds            $len, #0x70
+       bmi             .Lxts_dec_done
+
+       vldmia          $magic, {$twmask}       @ load XTS magic
+       vshr.s64        @T[0], @XMM[8], #63
+       mov             r0, sp
+       vand            @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+       vadd.u64        @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+       vst1.64         {@XMM[$i-1]}, [r0,:128]!
+       vswp            `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+       vshr.s64        @T[1], @XMM[$i], #63
+       veor            @XMM[$i], @XMM[$i], @T[0]
+       vand            @T[1], @T[1], $twmask
+___
+       @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+       vld1.8          {@XMM[$i-10]}, [$inp]!
+       subs            $len, #0x10
+       bmi             .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+       veor            @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+       sub             $len, #0x10
+       vst1.64         {@XMM[15]}, [r0,:128]           @ next round tweak
+
+       vld1.8          {@XMM[6]}, [$inp]!
+       veor            @XMM[5], @XMM[5], @XMM[13]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[6], @XMM[6], @XMM[14]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       vld1.64         {@XMM[14]}, [r0,:128]!
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       veor            @XMM[12], @XMM[3], @XMM[14]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+       vst1.8          {@XMM[12]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+       vst1.64         {@XMM[14]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[4], @XMM[4], @XMM[12]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[5], @XMM[5], @XMM[13]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]-@XMM[13]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       veor            @XMM[11], @XMM[7], @XMM[13]
+       vst1.8          {@XMM[10]-@XMM[11]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+       vst1.64         {@XMM[13]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[3], @XMM[3], @XMM[11]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[4], @XMM[4], @XMM[12]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       vld1.64         {@XMM[12]}, [r0,:128]!
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       veor            @XMM[10], @XMM[2], @XMM[12]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+       vst1.8          {@XMM[10]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+       vst1.64         {@XMM[12]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[2], @XMM[2], @XMM[10]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[3], @XMM[3], @XMM[11]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+       vld1.64         {@XMM[10]-@XMM[11]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       veor            @XMM[9], @XMM[4], @XMM[11]
+       vst1.8          {@XMM[8]-@XMM[9]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+       vst1.64         {@XMM[11]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[1], @XMM[1], @XMM[9]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[2], @XMM[2], @XMM[10]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       vld1.64         {@XMM[10]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       veor            @XMM[8], @XMM[6], @XMM[10]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+       vst1.8          {@XMM[8]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+       vst1.64         {@XMM[10]}, [r0,:128]           @ next round tweak
+
+       veor            @XMM[0], @XMM[0], @XMM[8]
+#ifndef        BSAES_ASM_EXTENDED_KEY
+       add             r4, sp, #0x90                   @ pass key schedule
+#else
+       add             r4, $key, #248                  @ pass key schedule
+#endif
+       veor            @XMM[1], @XMM[1], @XMM[9]
+       mov             r5, $rounds                     @ pass rounds
+       mov             r0, sp
+
+       bl              _bsaes_decrypt8
+
+       vld1.64         {@XMM[8]-@XMM[9]}, [r0,:128]!
+       veor            @XMM[0], @XMM[0], @XMM[ 8]
+       veor            @XMM[1], @XMM[1], @XMM[ 9]
+       vst1.8          {@XMM[0]-@XMM[1]}, [$out]!
+
+       vld1.64         {@XMM[8]}, [r0,:128]            @ next round tweak
+       b               .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+       mov             r0, sp
+       veor            @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                         @ preserve fp
+       mov             r5, $magic                      @ preserve magic
+
+       bl              AES_decrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [$out]!
+       mov             $fp, r4
+       mov             $magic, r5
+
+       vmov            @XMM[8], @XMM[9]                @ next round tweak
+
+.Lxts_dec_done:
+#ifndef        XTS_CHAIN_TWEAK
+       adds            $len, #0x10
+       beq             .Lxts_dec_ret
+
+       @ calculate one round of extra tweak for the stolen ciphertext
+       vldmia          $magic, {$twmask}
+       vshr.s64        @XMM[6], @XMM[8], #63
+       vand            @XMM[6], @XMM[6], $twmask
+       vadd.u64        @XMM[9], @XMM[8], @XMM[8]
+       vswp            `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+       veor            @XMM[9], @XMM[9], @XMM[6]
+
+       @ perform the final decryption with the last tweak value
+       vld1.8          {@XMM[0]}, [$inp]!
+       mov             r0, sp
+       veor            @XMM[0], @XMM[0], @XMM[9]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+       mov             r4, $fp                 @ preserve fp
+
+       bl              AES_decrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[9]
+       vst1.8          {@XMM[0]}, [$out]
+
+       mov             r6, $out
+.Lxts_dec_steal:
+       ldrb            r1, [$out]
+       ldrb            r0, [$inp], #1
+       strb            r1, [$out, #0x10]
+       strb            r0, [$out], #1
+
+       subs            $len, #1
+       bhi             .Lxts_dec_steal
+
+       vld1.8          {@XMM[0]}, [r6]
+       mov             r0, sp
+       veor            @XMM[0], @XMM[8]
+       mov             r1, sp
+       vst1.8          {@XMM[0]}, [sp,:128]
+       mov             r2, $key
+
+       bl              AES_decrypt
+
+       vld1.8          {@XMM[0]}, [sp,:128]
+       veor            @XMM[0], @XMM[0], @XMM[8]
+       vst1.8          {@XMM[0]}, [r6]
+       mov             $fp, r4
+#endif
+
+.Lxts_dec_ret:
+       bic             r0, $fp, #0xf
+       vmov.i32        q0, #0
+       vmov.i32        q1, #0
+#ifdef XTS_CHAIN_TWEAK
+       ldr             r1, [$fp, #0x20+VFP_ABI_FRAME]  @ chain tweak
+#endif
+.Lxts_dec_bzero:                               @ wipe key schedule [if any]
+       vstmia          sp!, {q0-q1}
+       cmp             sp, r0
+       bne             .Lxts_dec_bzero
+
+       mov             sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+       vst1.8          {@XMM[8]}, [r1]
+#endif
+       VFP_ABI_POP
+       ldmia           sp!, {r4-r10, pc}       @ return
+
+.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+        last if (!s/^#/@/ and !/^$/);
+        print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
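
The vadd.u64/vshr.s64/vswp/vand sequences in the tweak loops above implement the XTS tweak update, i.e. multiplication of the 128-bit tweak by x in GF(2^128) using the 0x87 reduction constant loaded from .Lxts_magic. A minimal C sketch of the same update, assuming the tweak is held as two little-endian 64-bit halves (the helper name is made up):

#include <stdint.h>

/*
 * Hypothetical helper, for illustration only: advance an XTS tweak, i.e.
 * multiply it by x in GF(2^128).  t[0] holds the low 64 bits and t[1] the
 * high 64 bits of the little-endian tweak; 0x87 is the reduction constant
 * kept at .Lxts_magic above.
 */
static void xts_next_tweak(uint64_t t[2])
{
        uint64_t carry = t[1] >> 63;            /* bit shifted out of the top */

        t[1] = (t[1] << 1) | (t[0] >> 63);      /* 128-bit left shift by one  */
        t[0] = (t[0] << 1) ^ (carry * 0x87);    /* fold the carry back in     */
}

The loops above perform this update once per block, queuing each previous tweak on the stack so it can be XORed back into the _bsaes_encrypt8/_bsaes_decrypt8 output.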
index 1a7024b413511d6742f6fb928407603c648c77e3..c38b58c8020215f54af8140e97de8467ad3c3c16 100644 (file)
@@ -24,6 +24,7 @@ generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
 generic-y += siginfo.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += socket.h
 generic-y += sockios.h
index fcc1b5bf6979cafd4850ff88b3cf8d21bb6791a5..5c228516057552b6eca4f6f5a54f2a3f9733fd09 100644 (file)
 #define put_byte_3      lsl #0
 #endif
 
+/* Select code for any configuration running in BE8 mode */
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define ARM_BE8(code...) code
+#else
+#define ARM_BE8(code...)
+#endif
+
 /*
  * Data preload for architectures that support it
  */
index da1c77d39327963ab10e633aeb8809aac7da2dec..62d2cb53b06935aed4430bf33c6a127239801ff1 100644 (file)
@@ -12,6 +12,7 @@
 #define __ASM_ARM_ATOMIC_H
 
 #include <linux/compiler.h>
+#include <linux/prefetch.h>
 #include <linux/types.h>
 #include <linux/irqflags.h>
 #include <asm/barrier.h>
@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v)
        unsigned long tmp;
        int result;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic_add\n"
 "1:    ldrex   %0, [%3]\n"
 "      add     %0, %0, %4\n"
@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v)
        unsigned long tmp;
        int result;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic_sub\n"
 "1:    ldrex   %0, [%3]\n"
 "      sub     %0, %0, %4\n"
@@ -114,7 +117,8 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 
 static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 {
-       unsigned long oldval, res;
+       int oldval;
+       unsigned long res;
 
        smp_mb();
 
@@ -134,21 +138,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
        return oldval;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-       unsigned long tmp, tmp2;
-
-       __asm__ __volatile__("@ atomic_clear_mask\n"
-"1:    ldrex   %0, [%3]\n"
-"      bic     %0, %0, %4\n"
-"      strex   %1, %0, [%3]\n"
-"      teq     %1, #0\n"
-"      bne     1b"
-       : "=&r" (tmp), "=&r" (tmp2), "+Qo" (*addr)
-       : "r" (addr), "Ir" (mask)
-       : "cc");
-}
-
 #else /* ARM_ARCH_6 */
 
 #ifdef CONFIG_SMP
@@ -197,15 +186,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
        return ret;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-       unsigned long flags;
-
-       raw_local_irq_save(flags);
-       *addr &= ~mask;
-       raw_local_irq_restore(flags);
-}
-
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -238,15 +218,15 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 
 #ifndef CONFIG_GENERIC_ATOMIC64
 typedef struct {
-       u64 __aligned(8) counter;
+       long long counter;
 } atomic64_t;
 
 #define ATOMIC64_INIT(i) { (i) }
 
 #ifdef CONFIG_ARM_LPAE
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
-       u64 result;
+       long long result;
 
        __asm__ __volatile__("@ atomic64_read\n"
 "      ldrd    %0, %H0, [%1]"
@@ -257,7 +237,7 @@ static inline u64 atomic64_read(const atomic64_t *v)
        return result;
 }
 
-static inline void atomic64_set(atomic64_t *v, u64 i)
+static inline void atomic64_set(atomic64_t *v, long long i)
 {
        __asm__ __volatile__("@ atomic64_set\n"
 "      strd    %2, %H2, [%1]"
@@ -266,9 +246,9 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
        );
 }
 #else
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
-       u64 result;
+       long long result;
 
        __asm__ __volatile__("@ atomic64_read\n"
 "      ldrexd  %0, %H0, [%1]"
@@ -279,10 +259,11 @@ static inline u64 atomic64_read(const atomic64_t *v)
        return result;
 }
 
-static inline void atomic64_set(atomic64_t *v, u64 i)
+static inline void atomic64_set(atomic64_t *v, long long i)
 {
-       u64 tmp;
+       long long tmp;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic64_set\n"
 "1:    ldrexd  %0, %H0, [%2]\n"
 "      strexd  %0, %3, %H3, [%2]\n"
@@ -294,15 +275,16 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
 }
 #endif
 
-static inline void atomic64_add(u64 i, atomic64_t *v)
+static inline void atomic64_add(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic64_add\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      adds    %0, %0, %4\n"
-"      adc     %H0, %H0, %H4\n"
+"      adds    %Q0, %Q0, %Q4\n"
+"      adc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -311,17 +293,17 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
        : "cc");
 }
 
-static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
 
        __asm__ __volatile__("@ atomic64_add_return\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      adds    %0, %0, %4\n"
-"      adc     %H0, %H0, %H4\n"
+"      adds    %Q0, %Q0, %Q4\n"
+"      adc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -334,15 +316,16 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
        return result;
 }
 
-static inline void atomic64_sub(u64 i, atomic64_t *v)
+static inline void atomic64_sub(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
+       prefetchw(&v->counter);
        __asm__ __volatile__("@ atomic64_sub\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      subs    %0, %0, %4\n"
-"      sbc     %H0, %H0, %H4\n"
+"      subs    %Q0, %Q0, %Q4\n"
+"      sbc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -351,17 +334,17 @@ static inline void atomic64_sub(u64 i, atomic64_t *v)
        : "cc");
 }
 
-static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
+static inline long long atomic64_sub_return(long long i, atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
 
        __asm__ __volatile__("@ atomic64_sub_return\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      subs    %0, %0, %4\n"
-"      sbc     %H0, %H0, %H4\n"
+"      subs    %Q0, %Q0, %Q4\n"
+"      sbc     %R0, %R0, %R4\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
 "      bne     1b"
@@ -374,9 +357,10 @@ static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
        return result;
 }
 
-static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
+static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
+                                       long long new)
 {
-       u64 oldval;
+       long long oldval;
        unsigned long res;
 
        smp_mb();
@@ -398,9 +382,9 @@ static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
        return oldval;
 }
 
-static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
+static inline long long atomic64_xchg(atomic64_t *ptr, long long new)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
@@ -419,18 +403,18 @@ static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
        return result;
 }
 
-static inline u64 atomic64_dec_if_positive(atomic64_t *v)
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
 {
-       u64 result;
+       long long result;
        unsigned long tmp;
 
        smp_mb();
 
        __asm__ __volatile__("@ atomic64_dec_if_positive\n"
 "1:    ldrexd  %0, %H0, [%3]\n"
-"      subs    %0, %0, #1\n"
-"      sbc     %H0, %H0, #0\n"
-"      teq     %H0, #0\n"
+"      subs    %Q0, %Q0, #1\n"
+"      sbc     %R0, %R0, #0\n"
+"      teq     %R0, #0\n"
 "      bmi     2f\n"
 "      strexd  %1, %0, %H0, [%3]\n"
 "      teq     %1, #0\n"
@@ -445,9 +429,9 @@ static inline u64 atomic64_dec_if_positive(atomic64_t *v)
        return result;
 }
 
-static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
-       u64 val;
+       long long val;
        unsigned long tmp;
        int ret = 1;
 
@@ -459,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
 "      teqeq   %H0, %H5\n"
 "      moveq   %1, #0\n"
 "      beq     2f\n"
-"      adds    %0, %0, %6\n"
-"      adc     %H0, %H0, %H6\n"
+"      adds    %Q0, %Q0, %Q6\n"
+"      adc     %R0, %R0, %R6\n"
 "      strexd  %2, %0, %H0, [%4]\n"
 "      teq     %2, #0\n"
 "      bne     1b\n"
diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h
new file mode 100644 (file)
index 0000000..1714800
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * arch/arm/include/asm/bL_switcher.h
+ *
+ * Created by:  Nicolas Pitre, April 2012
+ * Copyright:   (C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_BL_SWITCHER_H
+#define ASM_BL_SWITCHER_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+typedef void (*bL_switch_completion_handler)(void *cookie);
+
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+                        bL_switch_completion_handler completer,
+                        void *completer_cookie);
+static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
+{
+       return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
+}
+
+/*
+ * Register here to be notified about runtime enabling/disabling of
+ * the switcher.
+ *
+ * The notifier chain is called with the switcher activation lock held:
+ * the switcher will not be enabled or disabled during callbacks.
+ * Callbacks must not call bL_switcher_{get,put}_enabled().
+ */
+#define BL_NOTIFY_PRE_ENABLE   0
+#define BL_NOTIFY_POST_ENABLE  1
+#define BL_NOTIFY_PRE_DISABLE  2
+#define BL_NOTIFY_POST_DISABLE 3
+
+#ifdef CONFIG_BL_SWITCHER
+
+int bL_switcher_register_notifier(struct notifier_block *nb);
+int bL_switcher_unregister_notifier(struct notifier_block *nb);
+
+/*
+ * Use these functions to temporarily prevent enabling/disabling of
+ * the switcher.
+ * bL_switcher_get_enabled() returns true if the switcher is currently
+ * enabled.  Each call to bL_switcher_get_enabled() must be followed
+ * by a call to bL_switcher_put_enabled().  These functions are not
+ * recursive.
+ */
+bool bL_switcher_get_enabled(void);
+void bL_switcher_put_enabled(void);
+
+int bL_switcher_trace_trigger(void);
+int bL_switcher_get_logical_index(u32 mpidr);
+
+#else
+static inline int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline bool bL_switcher_get_enabled(void) { return false; }
+static inline void bL_switcher_put_enabled(void) { }
+static inline int bL_switcher_trace_trigger(void) { return 0; }
+static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
+#endif /* CONFIG_BL_SWITCHER */
+
+#endif
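
The header above only declares the switcher interface. As a hedged sketch of how a caller might block until a requested switch has actually happened, pairing bL_switch_request_cb() with a completion (the my_* names are made up, and a zero return is assumed to mean the request was accepted):

#include <linux/completion.h>
#include <asm/bL_switcher.h>

/* Hypothetical completer: runs once the switch for this CPU has taken place. */
static void my_switch_done(void *cookie)
{
        complete(cookie);
}

/* Hypothetical helper: request a switch and wait for it to complete. */
static int my_switch_and_wait(unsigned int cpu, unsigned int new_cluster)
{
        DECLARE_COMPLETION_ONSTACK(done);
        int ret;

        ret = bL_switch_request_cb(cpu, new_cluster, my_switch_done, &done);
        if (!ret)
                wait_for_completion(&done);
        return ret;
}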
index 7af5c6c3653a8061bad62211bbd3073af82ab3f0..b274bde24905a7503f60d38672346636d093854b 100644 (file)
@@ -2,6 +2,8 @@
 #define _ASMARM_BUG_H
 
 #include <linux/linkage.h>
+#include <linux/types.h>
+#include <asm/opcodes.h>
 
 #ifdef CONFIG_BUG
 
  */
 #ifdef CONFIG_THUMB2_KERNEL
 #define BUG_INSTR_VALUE 0xde02
-#define BUG_INSTR_TYPE ".hword "
+#define BUG_INSTR(__value) __inst_thumb16(__value)
 #else
 #define BUG_INSTR_VALUE 0xe7f001f2
-#define BUG_INSTR_TYPE ".word "
+#define BUG_INSTR(__value) __inst_arm(__value)
 #endif
 
 
@@ -33,7 +35,7 @@
 
 #define __BUG(__file, __line, __value)                         \
 do {                                                           \
-       asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n"        \
+       asm volatile("1:\t" BUG_INSTR(__value) "\n"  \
                ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
                "2:\t.asciz " #__file "\n"                      \
                ".popsection\n"                                 \
@@ -48,7 +50,7 @@ do {                                                          \
 
 #define __BUG(__file, __line, __value)                         \
 do {                                                           \
-       asm volatile(BUG_INSTR_TYPE #__value);                  \
+       asm volatile(BUG_INSTR(__value) "\n");                  \
        unreachable();                                          \
 } while (0)
 #endif  /* CONFIG_DEBUG_BUGVERBOSE */
index 15f2d5bf8875636e1514d377fc61b426534b3f74..ee753f1749cd795b03557273afdb3d0832d6fc8e 100644 (file)
@@ -435,4 +435,50 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
 #define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr))
 #define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr))
 
+/*
+ * Disabling cache access for one CPU in an ARMv7 SMP system is tricky.
+ * To do so we must:
+ *
+ * - Clear the SCTLR.C bit to prevent further cache allocations
+ * - Flush the desired level of cache
+ * - Clear the ACTLR "SMP" bit to disable local coherency
+ *
+ * ... and to do so without any memory access in between those steps,
+ * not even to the stack.
+ *
+ * WARNING -- After this has been called:
+ *
+ * - No ldrex/strex (and similar) instructions must be used.
+ * - The CPU is obviously no longer coherent with the other CPUs.
+ * - This is unlikely to work as expected if Linux is running non-secure.
+ *
+ * Note:
+ *
+ * - This is known to apply to several ARMv7 processor implementations;
+ *   however, some exceptions may exist.  Caveat emptor.
+ *
+ * - The clobber list is dictated by the call to v7_flush_dcache_*.
+ *   fp is preserved to the stack explicitly prior to disabling the cache,
+ *   since adding it to the clobber list is incompatible with having
+ *   CONFIG_FRAME_POINTER=y.  ip is saved as well in case the linker ever
+ *   inserts r12-clobbering trampolines, and to keep sp 64-bit aligned.
+ */
+#define v7_exit_coherency_flush(level) \
+       asm volatile( \
+       "stmfd  sp!, {fp, ip} \n\t" \
+       "mrc    p15, 0, r0, c1, c0, 0   @ get SCTLR \n\t" \
+       "bic    r0, r0, #"__stringify(CR_C)" \n\t" \
+       "mcr    p15, 0, r0, c1, c0, 0   @ set SCTLR \n\t" \
+       "isb    \n\t" \
+       "bl     v7_flush_dcache_"__stringify(level)" \n\t" \
+       "clrex  \n\t" \
+       "mrc    p15, 0, r0, c1, c0, 1   @ get ACTLR \n\t" \
+       "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t" \
+       "mcr    p15, 0, r0, c1, c0, 1   @ set ACTLR \n\t" \
+       "isb    \n\t" \
+       "dsb    \n\t" \
+       "ldmfd  sp!, {fp, ip}" \
+       : : : "r0","r1","r2","r3","r4","r5","r6","r7", \
+             "r9","r10","lr","memory" )
+
 #endif
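
A hedged usage sketch for the macro above: everything around the v7_exit_coherency_flush() call is a placeholder, and the louis argument assumes the usual v7_flush_dcache_louis/v7_flush_dcache_all naming of the flush routines the macro branches to.

#include <asm/cacheflush.h>

/* Hypothetical MCPM-style power_down path; only the flush call is real. */
static void my_pm_power_down(void)
{
        /* ... record that this CPU is going down, drop any locks ... */

        /*
         * Flush the local cache levels, clear SCTLR.C and leave SMP
         * coherency in one sequence, with no memory accesses in between.
         */
        v7_exit_coherency_flush(louis);

        /* ... the power controller may now safely cut power to this CPU ... */
}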
index 4f009c10540dff2a2e7efd08b0671c2369547b90..df2fbba7efc80d57074a6053704a9c70119aae03 100644 (file)
@@ -223,6 +223,42 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
        return ret;
 }
 
+static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
+                                            unsigned long long old,
+                                            unsigned long long new)
+{
+       unsigned long long oldval;
+       unsigned long res;
+
+       __asm__ __volatile__(
+"1:    ldrexd          %1, %H1, [%3]\n"
+"      teq             %1, %4\n"
+"      teqeq           %H1, %H4\n"
+"      bne             2f\n"
+"      strexd          %0, %5, %H5, [%3]\n"
+"      teq             %0, #0\n"
+"      bne             1b\n"
+"2:"
+       : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr)
+       : "r" (ptr), "r" (old), "r" (new)
+       : "cc");
+
+       return oldval;
+}
+
+static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
+                                               unsigned long long old,
+                                               unsigned long long new)
+{
+       unsigned long long ret;
+
+       smp_mb();
+       ret = __cmpxchg64(ptr, old, new);
+       smp_mb();
+
+       return ret;
+}
+
 #define cmpxchg_local(ptr,o,n)                                         \
        ((__typeof__(*(ptr)))__cmpxchg_local((ptr),                     \
                                       (unsigned long)(o),              \
@@ -230,18 +266,16 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
                                       sizeof(*(ptr))))
 
 #define cmpxchg64(ptr, o, n)                                           \
-       ((__typeof__(*(ptr)))atomic64_cmpxchg(container_of((ptr),       \
-                                               atomic64_t,             \
-                                               counter),               \
-                                             (unsigned long long)(o),  \
-                                             (unsigned long long)(n)))
-
-#define cmpxchg64_local(ptr, o, n)                                     \
-       ((__typeof__(*(ptr)))local64_cmpxchg(container_of((ptr),        \
-                                               local64_t,              \
-                                               a),                     \
-                                            (unsigned long long)(o),   \
-                                            (unsigned long long)(n)))
+       ((__typeof__(*(ptr)))__cmpxchg64_mb((ptr),                      \
+                                       (unsigned long long)(o),        \
+                                       (unsigned long long)(n)))
+
+#define cmpxchg64_relaxed(ptr, o, n)                                   \
+       ((__typeof__(*(ptr)))__cmpxchg64((ptr),                         \
+                                       (unsigned long long)(o),        \
+                                       (unsigned long long)(n)))
+
+#define cmpxchg64_local(ptr, o, n)     cmpxchg64_relaxed((ptr), (o), (n))
 
 #endif /* __LINUX_ARM_ARCH__ >= 6 */
 
index 9672e978d50df67d94c3dd86d23f3bcdd187c54d..acdde76b39bbae3064034fff78b9dd2b95bbd39c 100644 (file)
@@ -10,6 +10,7 @@
 #define CPUID_TLBTYPE  3
 #define CPUID_MPUIR    4
 #define CPUID_MPIDR    5
+#define CPUID_REVIDR   6
 
 #ifdef CONFIG_CPU_V7M
 #define CPUID_EXT_PFR0 0x40
index 2740c2a2df639361617f6fe484ead14f8625eaf2..fe3ea776dc34267724f377465134e52b39434fed 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/threads.h>
 #include <asm/irq.h>
 
-#define NR_IPI 6
+#define NR_IPI 8
 
 typedef struct {
        unsigned int __softirq_pending;
index 0cf7a6b842ff4ad40847394ea5ff8fd77bce29e7..ad774f37c47cda0f6201d7a7aad7d967cec4786b 100644 (file)
@@ -24,8 +24,8 @@
 #define TRACER_TIMEOUT 10000
 
 #define etm_writel(t, v, x) \
-       (__raw_writel((v), (t)->etm_regs + (x)))
-#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x)))
+       (writel_relaxed((v), (t)->etm_regs + (x)))
+#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
 
 /* CoreSight Management Registers */
 #define CSMR_LOCKACCESS 0xfb0
 #define ETBFF_TRIGFL           BIT(10)
 
 #define etb_writel(t, v, x) \
-       (__raw_writel((v), (t)->etb_regs + (x)))
-#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x)))
+       (writel_relaxed((v), (t)->etb_regs + (x)))
+#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
 
 #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
 #define etm_unlock(t) \
index 48066ce9ea34f64961111c487fbc4f26e037b842..0a9d5dd932941a1f0635574904ecd47259b87295 100644 (file)
@@ -11,6 +11,7 @@
 #define __ARM_KGDB_H__
 
 #include <linux/ptrace.h>
+#include <asm/opcodes.h>
 
 /*
  * GDB assumes that we're a user process being debugged, so
@@ -41,7 +42,7 @@
 
 static inline void arch_kgdb_breakpoint(void)
 {
-       asm(".word 0xe7ffdeff");
+       asm(__inst_arm(0xe7ffdeff));
 }
 
 extern void kgdb_handle_bus_error(void);
index 402a2bc6aa687b94b09af6efa039c58fb80436d2..17a3fa2979e8ae5c5a56f88eda448f635c3e132c 100644 (file)
@@ -49,6 +49,7 @@ struct machine_desc {
        bool                    (*smp_init)(void);
        void                    (*fixup)(struct tag *, char **,
                                         struct meminfo *);
+       void                    (*init_meminfo)(void);
        void                    (*reserve)(void);/* reserve mem blocks  */
        void                    (*map_io)(void);/* IO mapping function  */
        void                    (*init_early)(void);
index fc82a88f5b69e556e9235583364b5149b2572b66..608516ebabfe6111a651f3a5ca6e63c607046fab 100644 (file)
@@ -41,6 +41,14 @@ extern void mcpm_entry_point(void);
  */
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
 
+/*
+ * This sets an early poke, i.e. a value to be poked into some address
+ * from very early assembly code before the CPU is ungated.  The
+ * address must be physical, and if 0 then nothing will happen.
+ */
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+                        unsigned long poke_phys_addr, unsigned long poke_val);
+
 /*
  * CPU/cluster power operations API for higher subsystems to use.
  */
@@ -81,9 +89,39 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster);
  *
  * This will return if mcpm_platform_register() has not been called
  * previously in which case the caller should take appropriate action.
+ *
+ * On success, the CPU is not guaranteed to be truly halted until
+ * mcpm_cpu_power_down_finish() subsequently returns zero for the
+ * specified cpu.  Until then, other CPUs should make sure they do not
+ * trash memory the target CPU might be executing/accessing.
  */
 void mcpm_cpu_power_down(void);
 
+/**
+ * mcpm_cpu_power_down_finish - wait for a specified CPU to halt, and
+ *     make sure it is powered off
+ *
+ * @cpu: CPU number within given cluster
+ * @cluster: cluster number for the CPU
+ *
+ * Call this function to ensure that a pending powerdown has taken
+ * effect and the CPU is safely parked before performing non-mcpm
+ * operations that may affect the CPU (such as kexec trashing the
+ * kernel text).
+ *
+ * It is *not* necessary to call this function if you only need to
+ * serialise a pending powerdown with mcpm_cpu_power_up() or a wakeup
+ * event.
+ *
+ * Do not call this function unless the specified CPU has already
+ * called mcpm_cpu_power_down() or has committed to doing so.
+ *
+ * @return:
+ *     - zero if the CPU is in a safely parked state
+ *     - nonzero otherwise (e.g., timeout)
+ */
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster);
+
 /**
  * mcpm_cpu_suspend - bring the calling CPU in a suspended state
  *
@@ -126,6 +164,7 @@ int mcpm_cpu_powered_up(void);
 struct mcpm_platform_ops {
        int (*power_up)(unsigned int cpu, unsigned int cluster);
        void (*power_down)(void);
+       int (*power_down_finish)(unsigned int cpu, unsigned int cluster);
        void (*suspend)(u64);
        void (*powered_up)(void);
 };
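
The power_down/power_down_finish pairing documented above is what a CPU-hotplug kill path would build on. A hedged sketch follows: only mcpm_cpu_power_down_finish() comes from this header, the MPIDR decoding is one common convention, and the my_cpu_kill name is made up.

#include <asm/cputype.h>
#include <asm/mcpm.h>
#include <asm/smp_plat.h>

/*
 * Hypothetical smp_operations .cpu_kill handler: report success (nonzero)
 * only once the dying CPU is known to be safely parked.
 */
static int my_cpu_kill(unsigned int cpu)
{
        unsigned int mpidr = cpu_logical_map(cpu);
        unsigned int pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
        unsigned int pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);

        return !mcpm_cpu_power_down_finish(pcpu, pcluster);
}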
index e750a938fd3ce283ccd351cb73ed9c3d8df84e75..4dd21457ef9d2be8b1c94cac7eea97c8ef8cc1f6 100644 (file)
  * so that all we need to do is modify the 8-bit constant field.
  */
 #define __PV_BITS_31_24        0x81000000
+#define __PV_BITS_7_0  0x81
+
+extern u64 __pv_phys_offset;
+extern u64 __pv_offset;
+extern void fixup_pv_table(const void *, unsigned long);
+extern const void *__pv_table_begin, *__pv_table_end;
 
-extern unsigned long __pv_phys_offset;
 #define PHYS_OFFSET __pv_phys_offset
 
 #define __pv_stub(from,to,instr,type)                  \
@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset;
        : "=r" (to)                                     \
        : "r" (from), "I" (type))
 
-static inline unsigned long __virt_to_phys(unsigned long x)
+#define __pv_stub_mov_hi(t)                            \
+       __asm__ volatile("@ __pv_stub_mov\n"            \
+       "1:     mov     %R0, %1\n"                      \
+       "       .pushsection .pv_table,\"a\"\n"         \
+       "       .long   1b\n"                           \
+       "       .popsection\n"                          \
+       : "=r" (t)                                      \
+       : "I" (__PV_BITS_7_0))
+
+#define __pv_add_carry_stub(x, y)                      \
+       __asm__ volatile("@ __pv_add_carry_stub\n"      \
+       "1:     adds    %Q0, %1, %2\n"                  \
+       "       adc     %R0, %R0, #0\n"                 \
+       "       .pushsection .pv_table,\"a\"\n"         \
+       "       .long   1b\n"                           \
+       "       .popsection\n"                          \
+       : "+r" (y)                                      \
+       : "r" (x), "I" (__PV_BITS_31_24)                \
+       : "cc")
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
 {
-       unsigned long t;
-       __pv_stub(x, t, "add", __PV_BITS_31_24);
+       phys_addr_t t;
+
+       if (sizeof(phys_addr_t) == 4) {
+               __pv_stub(x, t, "add", __PV_BITS_31_24);
+       } else {
+               __pv_stub_mov_hi(t);
+               __pv_add_carry_stub(x, t);
+       }
        return t;
 }
 
-static inline unsigned long __phys_to_virt(unsigned long x)
+static inline unsigned long __phys_to_virt(phys_addr_t x)
 {
        unsigned long t;
        __pv_stub(x, t, "sub", __PV_BITS_31_24);
        return t;
 }
+
 #else
-#define __virt_to_phys(x)      ((x) - PAGE_OFFSET + PHYS_OFFSET)
-#define __phys_to_virt(x)      ((x) - PHYS_OFFSET + PAGE_OFFSET)
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
+{
+       return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
+}
+
+static inline unsigned long __phys_to_virt(phys_addr_t x)
+{
+       return x - PHYS_OFFSET + PAGE_OFFSET;
+}
+
 #endif
 #endif
 #endif /* __ASSEMBLY__ */
@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x)
 
 static inline void *phys_to_virt(phys_addr_t x)
 {
-       return (void *)(__phys_to_virt((unsigned long)(x)));
+       return (void *)__phys_to_virt(x);
 }
 
 /*
  * Drivers should NOT use these either.
  */
 #define __pa(x)                        __virt_to_phys((unsigned long)(x))
-#define __va(x)                        ((void *)__phys_to_virt((unsigned long)(x)))
+#define __va(x)                        ((void *)__phys_to_virt((phys_addr_t)(x)))
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
 
+extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);
+
+/*
+ * These are for systems that have a hardware-interconnect-supported alias of
+ * physical memory for idmap purposes.  Most cases should leave these
+ * untouched.
+ */
+static inline phys_addr_t __virt_to_idmap(unsigned long x)
+{
+       if (arch_virt_to_idmap)
+               return arch_virt_to_idmap(x);
+       else
+               return __virt_to_phys(x);
+}
+
+#define virt_to_idmap(x)       __virt_to_idmap((unsigned long)(x))
+
 /*
  * Virtual <-> DMA view memory address translations
  * Again, these are *only* valid on the kernel direct mapped RAM
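
Per the comment above, only platforms whose interconnect exposes an alias of RAM need to override arch_virt_to_idmap. A hedged sketch of such an override (the SoC name and alias base are invented):

#include <linux/init.h>
#include <asm/memory.h>

#define MY_SOC_IDMAP_PHYS_START 0x80000000ULL   /* made-up low alias of RAM */

/* Hypothetical platform hook: return the interconnect alias for idmap use. */
static phys_addr_t my_soc_virt_to_idmap(unsigned long x)
{
        return (phys_addr_t)(x - PAGE_OFFSET) + MY_SOC_IDMAP_PHYS_START;
}

/* Hypothetical early init: install the hook before the idmap is built. */
static void __init my_soc_init_early(void)
{
        arch_virt_to_idmap = my_soc_virt_to_idmap;
}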
index 6f18da09668b5f324ad8c32e6fd6058e5b837e2a..64fd15159b7de710b81a47c0aa9d6bab3af1d691 100644 (file)
@@ -16,7 +16,7 @@ typedef struct {
 #ifdef CONFIG_CPU_HAS_ASID
 #define ASID_BITS      8
 #define ASID_MASK      ((~0ULL) << ASID_BITS)
-#define ASID(mm)       ((mm)->context.id.counter & ~ASID_MASK)
+#define ASID(mm)       ((unsigned int)((mm)->context.id.counter & ~ASID_MASK))
 #else
 #define ASID(mm)       (0)
 #endif
index f97ee02386ee063ba12b78786c9a6cd8a4106676..86a659a19526c75a2ba3b91ce839925cc52d26de 100644 (file)
@@ -181,6 +181,13 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 
 #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
 
+/*
+ * We don't have huge page support for short descriptors; for the moment,
+ * define empty stubs for use by pin_page_for_write.
+ */
+#define pmd_hugewillfault(pmd) (0)
+#define pmd_thp_or_huge(pmd)   (0)
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_PGTABLE_2LEVEL_H */
index 5689c18c85f5ebafb95a9bb2cc99fda227992976..39c54cfa03e9b103ef39982a43bac74d5436b791 100644 (file)
@@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)         (!(pmd_val(pmd) & PMD_SECT_RDONLY))
 
+#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
+#define pmd_thp_or_huge(pmd)   (pmd_huge(pmd) || pmd_trans_huge(pmd))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
 #define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
index 413f3876341cd6fd2e7bc4b1c6a71873cadaa887..c3d5fc124a054c6309ffacdb2845ff22fd5bfa56 100644 (file)
@@ -22,6 +22,7 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/ptrace.h>
 #include <asm/types.h>
+#include <asm/unified.h>
 
 #ifdef __KERNEL__
 #define STACK_TOP      ((current->personality & ADDR_LIMIT_32BIT) ? \
@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p);
 #define KSTK_EIP(tsk)  task_pt_regs(tsk)->ARM_pc
 #define KSTK_ESP(tsk)  task_pt_regs(tsk)->ARM_sp
 
+#ifdef CONFIG_SMP
+#define __ALT_SMP_ASM(smp, up)                                         \
+       "9998:  " smp "\n"                                              \
+       "       .pushsection \".alt.smp.init\", \"a\"\n"                \
+       "       .long   9998b\n"                                        \
+       "       " up "\n"                                               \
+       "       .popsection\n"
+#else
+#define __ALT_SMP_ASM(smp, up) up
+#endif
+
 /*
  * Prefetching support - only ARMv5.
  */
@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr)
 {
        __asm__ __volatile__(
                "pld\t%a0"
-               :
-               : "p" (ptr)
-               : "cc");
+               :: "p" (ptr));
 }
 
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
 #define ARCH_HAS_PREFETCHW
-#define prefetchw(ptr) prefetch(ptr)
-
-#define ARCH_HAS_SPINLOCK_PREFETCH
-#define spin_lock_prefetch(x) do { } while (0)
-
+static inline void prefetchw(const void *ptr)
+{
+       __asm__ __volatile__(
+               ".arch_extension        mp\n"
+               __ALT_SMP_ASM(
+                       WASM(pldw)              "\t%a0",
+                       WASM(pld)               "\t%a0"
+               )
+               :: "p" (ptr));
+}
+#endif
 #endif
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
index c50f0560950110b9f60d21f9ea9647be4ee77688..8d6a089dfb7628fe166f3757cd8aeba3c8270f0d 100644 (file)
@@ -49,7 +49,7 @@ extern struct meminfo meminfo;
 #define bank_phys_end(bank)    ((bank)->start + (bank)->size)
 #define bank_phys_size(bank)   (bank)->size
 
-extern int arm_add_memory(phys_addr_t start, phys_addr_t size);
+extern int arm_add_memory(u64 start, u64 size);
 extern void early_print(const char *str, ...);
 extern void dump_machine_table(void);
 
index a8cae71caceb3fb89c1ec949063b7a0d621dbdca..22a3b9b5d4a16fd4ece50bdfc83859f6ea38352f 100644 (file)
@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
 
+extern int register_ipi_completion(struct completion *completion, int cpu);
+
 struct smp_operations {
 #ifdef CONFIG_SMP
        /*
index 4f2c28060c9aa227c47e73ac6557c45add91128f..ef3c6072aa45345ae4594f22aebbe9a9ebc538f1 100644 (file)
@@ -5,21 +5,13 @@
 #error SMP not supported on pre-ARMv6 CPUs
 #endif
 
-#include <asm/processor.h>
+#include <linux/prefetch.h>
 
 /*
  * sev and wfe are ARMv6K extensions.  Uniprocessor ARMv6 may not have the K
  * extensions, so when running on UP, we have to patch these instructions away.
  */
-#define ALT_SMP(smp, up)                                       \
-       "9998:  " smp "\n"                                      \
-       "       .pushsection \".alt.smp.init\", \"a\"\n"        \
-       "       .long   9998b\n"                                \
-       "       " up "\n"                                       \
-       "       .popsection\n"
-
 #ifdef CONFIG_THUMB2_KERNEL
-#define SEV            ALT_SMP("sev.w", "nop.w")
 /*
  * For Thumb-2, special care is needed to ensure that the conditional WFE
  * instruction really does assemble to exactly 4 bytes (as required by
  * the assembler won't change IT instructions which are explicitly present
  * in the input.
  */
-#define WFE(cond)      ALT_SMP(                \
+#define WFE(cond)      __ALT_SMP_ASM(          \
        "it " cond "\n\t"                       \
        "wfe" cond ".n",                        \
                                                \
        "nop.w"                                 \
 )
 #else
-#define SEV            ALT_SMP("sev", "nop")
-#define WFE(cond)      ALT_SMP("wfe" cond, "nop")
+#define WFE(cond)      __ALT_SMP_ASM("wfe" cond, "nop")
 #endif
 
+#define SEV            __ALT_SMP_ASM(WASM(sev), WASM(nop))
+
 static inline void dsb_sev(void)
 {
 #if __LINUX_ARM_ARCH__ >= 7
@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
        u32 newval;
        arch_spinlock_t lockval;
 
+       prefetchw(&lock->slock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%3]\n"
 "      add     %1, %0, %4\n"
@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
        unsigned long contended, res;
        u32 slock;
 
+       prefetchw(&lock->slock);
        do {
                __asm__ __volatile__(
                "       ldrex   %0, [%3]\n"
@@ -127,10 +122,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
        dsb_sev();
 }
 
+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+       return lock.tickets.owner == lock.tickets.next;
+}
+
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-       struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets);
-       return tickets.owner != tickets.next;
+       return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
@@ -152,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 {
        unsigned long tmp;
 
+       prefetchw(&rw->lock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%1]\n"
 "      teq     %0, #0\n"
@@ -170,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
 {
        unsigned long contended, res;
 
+       prefetchw(&rw->lock);
        do {
                __asm__ __volatile__(
                "       ldrex   %0, [%2]\n"
@@ -203,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 }
 
 /* write_can_lock - would write_trylock() succeed? */
-#define arch_write_can_lock(x)         ((x)->lock == 0)
+#define arch_write_can_lock(x)         (ACCESS_ONCE((x)->lock) == 0)
 
 /*
  * Read locks are a bit more hairy:
@@ -221,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 {
        unsigned long tmp, tmp2;
 
+       prefetchw(&rw->lock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%2]\n"
 "      adds    %0, %0, #1\n"
@@ -241,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 
        smp_mb();
 
+       prefetchw(&rw->lock);
        __asm__ __volatile__(
 "1:    ldrex   %0, [%2]\n"
 "      sub     %0, %0, #1\n"
@@ -259,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 {
        unsigned long contended, res;
 
+       prefetchw(&rw->lock);
        do {
                __asm__ __volatile__(
                "       ldrex   %0, [%2]\n"
@@ -280,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 }
 
 /* read_can_lock - would read_trylock() succeed? */
-#define arch_read_can_lock(x)          ((x)->lock < 0x80000000)
+#define arch_read_can_lock(x)          (ACCESS_ONCE((x)->lock) < 0x80000000)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
index b262d2f8b4784eba5b6805d431468c285c434b88..47663fcb10ad7aad7e3bc87f31636a7a77342e36 100644 (file)
@@ -25,7 +25,7 @@ typedef struct {
 #define __ARCH_SPIN_LOCK_UNLOCKED      { { 0 } }
 
 typedef struct {
-       volatile unsigned int lock;
+       u32 lock;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED                { 0 }
index 38960264040cd989068b7e897979194bd5d30bc4..def9e570199f90a0c42dc7da0f8998fba6a0ab39 100644 (file)
@@ -560,37 +560,6 @@ static inline void __flush_bp_all(void)
                asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero));
 }
 
-#include <asm/cputype.h>
-#ifdef CONFIG_ARM_ERRATA_798181
-static inline int erratum_a15_798181(void)
-{
-       unsigned int midr = read_cpuid_id();
-
-       /* Cortex-A15 r0p0..r3p2 affected */
-       if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2)
-               return 0;
-       return 1;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-       /*
-        * Dummy TLBIMVAIS. Using the unmapped address 0 and ASID 0.
-        */
-       asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
-       dsb(ish);
-}
-#else
-static inline int erratum_a15_798181(void)
-{
-       return 0;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-}
-#endif
-
 /*
  *     flush_pmd_entry
  *
@@ -697,4 +666,21 @@ extern void flush_bp_all(void);
 
 #endif
 
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_ARM_ERRATA_798181
+extern void erratum_a15_798181_init(void);
+#else
+static inline void erratum_a15_798181_init(void) {}
+#endif
+extern bool (*erratum_a15_798181_handler)(void);
+
+static inline bool erratum_a15_798181(void)
+{
+       if (unlikely(IS_ENABLED(CONFIG_ARM_ERRATA_798181) &&
+               erratum_a15_798181_handler))
+               return erratum_a15_798181_handler();
+       return false;
+}
+#endif
+
 #endif
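
The erratum test is now routed through a function pointer installed once at boot, so unaffected CPUs only pay an unlikely() branch. A hedged sketch of the same dispatch pattern with made-up names (quirk_* and some_workaround are illustrative; only IS_ENABLED() and the config symbol are real):

        static bool some_workaround(void)
        {
                /* stands in for the real TLB maintenance sequence */
                return true;
        }

        static bool (*quirk_handler)(void);     /* NULL when the CPU is unaffected */

        static void quirk_init(bool affected)
        {
                if (affected)
                        quirk_handler = some_workaround;
        }

        static inline bool quirk_active(void)
        {
                /* the IS_ENABLED() check folds away when the option is off */
                return IS_ENABLED(CONFIG_ARM_ERRATA_798181) && quirk_handler ?
                        quirk_handler() : false;
        }
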
index f5989f46b4d2d450f18b24faa946de750394ec13..b88beaba6b4a5cf9c0ccded73723c4ea41539167 100644 (file)
@@ -38,6 +38,8 @@
 #ifdef __ASSEMBLY__
 #define W(instr)       instr.w
 #define BSYM(sym)      sym + 1
+#else
+#define WASM(instr)    #instr ".w"
 #endif
 
 #else  /* !CONFIG_THUMB2_KERNEL */
@@ -50,6 +52,8 @@
 #ifdef __ASSEMBLY__
 #define W(instr)       instr
 #define BSYM(sym)      sym
+#else
+#define WASM(instr)    #instr
 #endif
 
 #endif /* CONFIG_THUMB2_KERNEL */
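
WASM() is the C counterpart of the assembler-only W() macro: it stringifies the mnemonic and appends ".w" on Thumb-2 kernels so that inline assembly emits the wide encoding. A hedged usage sketch (the sev mnemonic and macro name are just examples):

        /* expands to "sev.w" under CONFIG_THUMB2_KERNEL, plain "sev" otherwise */
        #define SKETCH_SEV()    __asm__ __volatile__(WASM(sev) : : : "memory")
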
diff --git a/arch/arm/include/debug/efm32.S b/arch/arm/include/debug/efm32.S
new file mode 100644 (file)
index 0000000..2265a19
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2013 Pengutronix
+ * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define UARTn_CMD              0x000c
+#define UARTn_CMD_TXEN                 0x0004
+
+#define        UARTn_STATUS            0x0010
+#define        UARTn_STATUS_TXC                0x0020
+#define        UARTn_STATUS_TXBL               0x0040
+
+#define        UARTn_TXDATA            0x0034
+
+               .macro  addruart, rx, tmp
+               ldr     \rx, =(CONFIG_DEBUG_UART_PHYS)
+
+               /*
+                * Enable TX. The driver might disable it to save energy. We
+                * don't bother disabling it again at the end, as power
+                * consumption isn't that important while debugging.
+                */
+               ldr     \tmp, =(UARTn_CMD_TXEN)
+               str     \tmp, [\rx, #UARTn_CMD]
+               .endm
+
+               .macro  senduart,rd,rx
+               strb    \rd, [\rx, #UARTn_TXDATA]
+               .endm
+
+               .macro  waituart,rd,rx
+1001:          ldr     \rd, [\rx, #UARTn_STATUS]
+               tst     \rd, #UARTn_STATUS_TXBL
+               beq     1001b
+               .endm
+
+               .macro  busyuart,rd,rx
+1001:          ldr     \rd, [\rx, #UARTn_STATUS]
+               tst     \rd, #UARTn_STATUS_TXC
+               bne     1001b
+               .endm
index 9166e1bc470e0153107f059ddd5fbec2570fb5a6..9d653d475903b56ea14e3e7c568319c963d9ceca 100644 (file)
 #ifdef CONFIG_DEBUG_MSM8960_UART
 #define MSM_DEBUG_UART_BASE    0xF0040000
 #define MSM_DEBUG_UART_PHYS    0x16440000
+#endif
+
+#ifdef CONFIG_DEBUG_MSM8974_UART
+#define MSM_DEBUG_UART_BASE    0xFA71E000
+#define MSM_DEBUG_UART_PHYS    0xF991E000
 #endif
 
        .macro  addruart, rp, rv, tmp
index 37c6895b87e6d72ce5837e57a4bae0a367339cb2..92ef808a23377275124a59550b6ab86f3e163486 100644 (file)
 
                .macro  waituart,rd,rx
 1001:          ldr     \rd, [\rx, #UART01x_FR]
+ ARM_BE8(      rev     \rd, \rd )
                tst     \rd, #UART01x_FR_TXFF
                bne     1001b
                .endm
 
                .macro  busyuart,rd,rx
 1001:          ldr     \rd, [\rx, #UART01x_FR]
+ ARM_BE8(      rev     \rd, \rd )
                tst     \rd, #UART01x_FR_BUSY
                bne     1001b
                .endm
index 18d76fd5a2afb2bf27b91980f29c77094e6638c0..70a1c9da30ca39d4d4d79e8e73c3b8aec0d607e3 100644 (file)
@@ -7,6 +7,7 @@ header-y += hwcap.h
 header-y += ioctls.h
 header-y += kvm_para.h
 header-y += mman.h
+header-y += perf_regs.h
 header-y += posix_types.h
 header-y += ptrace.h
 header-y += setup.h
diff --git a/arch/arm/include/uapi/asm/perf_regs.h b/arch/arm/include/uapi/asm/perf_regs.h
new file mode 100644 (file)
index 0000000..ce59448
--- /dev/null
@@ -0,0 +1,23 @@
+#ifndef _ASM_ARM_PERF_REGS_H
+#define _ASM_ARM_PERF_REGS_H
+
+enum perf_event_arm_regs {
+       PERF_REG_ARM_R0,
+       PERF_REG_ARM_R1,
+       PERF_REG_ARM_R2,
+       PERF_REG_ARM_R3,
+       PERF_REG_ARM_R4,
+       PERF_REG_ARM_R5,
+       PERF_REG_ARM_R6,
+       PERF_REG_ARM_R7,
+       PERF_REG_ARM_R8,
+       PERF_REG_ARM_R9,
+       PERF_REG_ARM_R10,
+       PERF_REG_ARM_FP,
+       PERF_REG_ARM_IP,
+       PERF_REG_ARM_SP,
+       PERF_REG_ARM_LR,
+       PERF_REG_ARM_PC,
+       PERF_REG_ARM_MAX,
+};
+#endif /* _ASM_ARM_PERF_REGS_H */
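
User space selects which of these registers to sample through the generic perf ABI. A hedged example, where PERF_SAMPLE_REGS_USER and sample_regs_user are the existing perf_event_attr interface and only the choice of registers is illustrative:

        #include <linux/perf_event.h>
        #include <asm/perf_regs.h>      /* the enum above */

        static void sample_arm_pc_sp_lr(struct perf_event_attr *attr)
        {
                attr->sample_type     |= PERF_SAMPLE_REGS_USER;
                attr->sample_regs_user = (1ULL << PERF_REG_ARM_PC) |
                                         (1ULL << PERF_REG_ARM_SP) |
                                         (1ULL << PERF_REG_ARM_LR);
        }
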
index 5140df5f23aa485214914a8dfbfdf31dc04a5691..a30fc9be9e9e6abc8ccbedb17fb61bddba71bcd7 100644 (file)
@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg
 
 obj-y          := elf.o entry-common.o irq.o opcodes.o \
                   process.o ptrace.o return_address.o \
-                  setup.o signal.o stacktrace.o sys_arm.o time.o traps.o
+                  setup.o signal.o sigreturn_codes.o \
+                  stacktrace.o sys_arm.o time.o traps.o
 
 obj-$(CONFIG_ATAGS)            += atags_parse.o
 obj-$(CONFIG_ATAGS_PROC)       += atags_proc.o
@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3)                += xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)       += xscale-cp0.o
 obj-$(CONFIG_CPU_PJ4)          += pj4-cp0.o
 obj-$(CONFIG_IWMMXT)           += iwmmxt.o
+obj-$(CONFIG_PERF_EVENTS)      += perf_regs.o
 obj-$(CONFIG_HW_PERF_EVENTS)   += perf_event.o perf_event_cpu.o
 AFLAGS_iwmmxt.o                        := -Wa,-mcpu=iwmmxt
 obj-$(CONFIG_ARM_CPU_TOPOLOGY)  += topology.o
index 60d3b738d4200987e76c75b2e9da513920087a89..1f031ddd0667a3e842317a90c59db3acc2284894 100644 (file)
@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
 
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 EXPORT_SYMBOL(__pv_phys_offset);
+EXPORT_SYMBOL(__pv_offset);
 #endif
index 9cbe70c8b0ef7b8d16a806602608fba205966d31..b3fb8c9e1ff2d75f15666a9233dee2d1b3ccb6bc 100644 (file)
@@ -192,6 +192,7 @@ __dabt_svc:
        svc_entry
        mov     r2, sp
        dabt_helper
+ THUMB(        ldr     r5, [sp, #S_PSR]        )       @ potentially updated CPSR
        svc_exit r5                             @ return from exception
  UNWIND(.fnend         )
 ENDPROC(__dabt_svc)
@@ -416,9 +417,8 @@ __und_usr:
        bne     __und_usr_thumb
        sub     r4, r2, #4                      @ ARM instr at LR - 4
 1:     ldrt    r0, [r4]
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       rev     r0, r0                          @ little endian instruction
-#endif
+ ARM_BE8(rev   r0, r0)                         @ little endian instruction
+
        @ r0 = 32-bit ARM instruction which caused the exception
        @ r2 = PC value for the following instruction (:= regs->ARM_pc)
        @ r4 = PC value for the faulting instruction
index bc6bd9683ba4555d9713e1693b258e6204fc90a5..a2dcafdf1bc89a176f80dda748625a615717979a 100644 (file)
@@ -393,9 +393,7 @@ ENTRY(vector_swi)
 #else
  USER( ldr     r10, [lr, #-4]          )       @ get SWI instruction
 #endif
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       rev     r10, r10                        @ little endian instruction
-#endif
+ ARM_BE8(rev   r10, r10)                       @ little endian instruction
 
 #elif defined(CONFIG_AEABI)
 
index 476de57dcef284602e126e29e2da03465df8ff09..7801866e626a2a1a4631d9e3e3fbd3c27ddda429 100644 (file)
@@ -77,6 +77,7 @@
 
        __HEAD
 ENTRY(stext)
+ ARM_BE8(setend        be )                    @ ensure we are in BE8 mode
 
  THUMB(        adr     r9, BSYM(1f)    )       @ Kernel is always entered in ARM.
  THUMB(        bx      r9              )       @ If this is a Thumb-2 kernel,
@@ -352,6 +353,9 @@ ENTRY(secondary_startup)
         * the processor type - there is no need to check the machine type
         * as it has already been validated by the primary processor.
         */
+
+ ARM_BE8(setend        be)                             @ ensure we are in BE8 mode
+
 #ifdef CONFIG_ARM_VIRT_EXT
        bl      __hyp_stub_install_secondary
 #endif
@@ -555,6 +559,14 @@ ENTRY(fixup_smp)
        ldmfd   sp!, {r4 - r6, pc}
 ENDPROC(fixup_smp)
 
+#ifdef __ARMEB__
+#define LOW_OFFSET     0x4
+#define HIGH_OFFSET    0x0
+#else
+#define LOW_OFFSET     0x0
+#define HIGH_OFFSET    0x4
+#endif
+
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 
 /* __fixup_pv_table - patch the stub instructions with the delta between
@@ -565,17 +577,20 @@ ENDPROC(fixup_smp)
        __HEAD
 __fixup_pv_table:
        adr     r0, 1f
-       ldmia   r0, {r3-r5, r7}
-       sub     r3, r0, r3      @ PHYS_OFFSET - PAGE_OFFSET
+       ldmia   r0, {r3-r7}
+       mvn     ip, #0
+       subs    r3, r0, r3      @ PHYS_OFFSET - PAGE_OFFSET
        add     r4, r4, r3      @ adjust table start address
        add     r5, r5, r3      @ adjust table end address
-       add     r7, r7, r3      @ adjust __pv_phys_offset address
-       str     r8, [r7]        @ save computed PHYS_OFFSET to __pv_phys_offset
+       add     r6, r6, r3      @ adjust __pv_phys_offset address
+       add     r7, r7, r3      @ adjust __pv_offset address
+       str     r8, [r6, #LOW_OFFSET]   @ save computed PHYS_OFFSET to __pv_phys_offset
+       strcc   ip, [r7, #HIGH_OFFSET]  @ save to __pv_offset high bits
        mov     r6, r3, lsr #24 @ constant for add/sub instructions
        teq     r3, r6, lsl #24 @ must be 16MiB aligned
 THUMB( it      ne              @ cross section branch )
        bne     __error
-       str     r6, [r7, #4]    @ save to __pv_offset
+       str     r3, [r7, #LOW_OFFSET]   @ save to __pv_offset low bits
        b       __fixup_a_pv_table
 ENDPROC(__fixup_pv_table)
 
@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table)
        .long   __pv_table_begin
        .long   __pv_table_end
 2:     .long   __pv_phys_offset
+       .long   __pv_offset
 
        .text
 __fixup_a_pv_table:
+       adr     r0, 3f
+       ldr     r6, [r0]
+       add     r6, r6, r3
+       ldr     r0, [r6, #HIGH_OFFSET]  @ pv_offset high word
+       ldr     r6, [r6, #LOW_OFFSET]   @ pv_offset low word
+       mov     r6, r6, lsr #24
+       cmn     r0, #1
 #ifdef CONFIG_THUMB2_KERNEL
+       moveq   r0, #0x200000   @ set bit 21, mov to mvn instruction
        lsls    r6, #24
        beq     2f
        clz     r7, r6
@@ -601,18 +625,42 @@ __fixup_a_pv_table:
        b       2f
 1:     add     r7, r3
        ldrh    ip, [r7, #2]
-       and     ip, 0x8f00
-       orr     ip, r6  @ mask in offset bits 31-24
+ARM_BE8(rev16  ip, ip)
+       tst     ip, #0x4000
+       and     ip, #0x8f00
+       orrne   ip, r6  @ mask in offset bits 31-24
+       orreq   ip, r0  @ mask in offset bits 7-0
+ARM_BE8(rev16  ip, ip)
        strh    ip, [r7, #2]
+       bne     2f
+       ldrh    ip, [r7]
+ARM_BE8(rev16  ip, ip)
+       bic     ip, #0x20
+       orr     ip, ip, r0, lsr #16
+ARM_BE8(rev16  ip, ip)
+       strh    ip, [r7]
 2:     cmp     r4, r5
        ldrcc   r7, [r4], #4    @ use branch for delay slot
        bcc     1b
        bx      lr
 #else
+       moveq   r0, #0x400000   @ set bit 22, mov to mvn instruction
        b       2f
 1:     ldr     ip, [r7, r3]
+#ifdef CONFIG_CPU_ENDIAN_BE8
+       @ in BE8, we load data in BE, but instructions still in LE
+       bic     ip, ip, #0xff000000
+       tst     ip, #0x000f0000 @ check the rotation field
+       orrne   ip, ip, r6, lsl #24 @ mask in offset bits 31-24
+       biceq   ip, ip, #0x00004000 @ clear bit 22
+       orreq   ip, ip, r0, lsl #24 @ mask in offset bits 7-0
+#else
        bic     ip, ip, #0x000000ff
-       orr     ip, ip, r6      @ mask in offset bits 31-24
+       tst     ip, #0xf00      @ check the rotation field
+       orrne   ip, ip, r6      @ mask in offset bits 31-24
+       biceq   ip, ip, #0x400000       @ clear bit 22
+       orreq   ip, ip, r0      @ mask in offset bits 7-0
+#endif
        str     ip, [r7, r3]
 2:     cmp     r4, r5
        ldrcc   r7, [r4], #4    @ use branch for delay slot
@@ -621,28 +669,30 @@ __fixup_a_pv_table:
 #endif
 ENDPROC(__fixup_a_pv_table)
 
+       .align
+3:     .long __pv_offset
+
 ENTRY(fixup_pv_table)
        stmfd   sp!, {r4 - r7, lr}
-       ldr     r2, 2f                  @ get address of __pv_phys_offset
        mov     r3, #0                  @ no offset
        mov     r4, r0                  @ r0 = table start
        add     r5, r0, r1              @ r1 = table size
-       ldr     r6, [r2, #4]            @ get __pv_offset
        bl      __fixup_a_pv_table
        ldmfd   sp!, {r4 - r7, pc}
 ENDPROC(fixup_pv_table)
 
-       .align
-2:     .long   __pv_phys_offset
-
        .data
        .globl  __pv_phys_offset
        .type   __pv_phys_offset, %object
 __pv_phys_offset:
-       .long   0
-       .size   __pv_phys_offset, . - __pv_phys_offset
+       .quad   0
+       .size   __pv_phys_offset, . -__pv_phys_offset
+
+       .globl  __pv_offset
+       .type   __pv_offset, %object
 __pv_offset:
-       .long   0
+       .quad   0
+       .size   __pv_offset, . -__pv_offset
 #endif
 
 #include "head-common.S"
index 7b95de6013571df3528716f2bba00f8dfd13badf..3d446605cbf84b89890bdf5bb2398a64a8401120 100644 (file)
@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
                /* Breakpoint */
                ctrl_base = ARM_BASE_BCR;
                val_base = ARM_BASE_BVR;
-               slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
        } else {
                /* Watchpoint */
                ctrl_base = ARM_BASE_WCR;
                val_base = ARM_BASE_WVR;
-               slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
        }
 
@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
        if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
                /* Breakpoint */
                base = ARM_BASE_BCR;
-               slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
        } else {
                /* Watchpoint */
                base = ARM_BASE_WCR;
-               slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
        }
 
@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
        struct arch_hw_breakpoint *info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+       slots = this_cpu_ptr(wp_on_reg);
 
        for (i = 0; i < core_num_wrps; ++i) {
                rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handler(unsigned long pc)
        struct perf_event *wp, **slots;
        struct arch_hw_breakpoint *info;
 
-       slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+       slots = this_cpu_ptr(wp_on_reg);
 
        for (i = 0; i < core_num_wrps; ++i) {
                rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
        struct arch_hw_breakpoint *info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+       slots = this_cpu_ptr(bp_on_reg);
 
        /* The exception entry code places the amended lr in the PC. */
        addr = regs->ARM_pc;
index 170e9f34003f414030c6070e8c913df21c5091cd..a7b621ece23d3c729d681e8004952c320215e9fc 100644 (file)
@@ -171,13 +171,13 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
 
 static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+       __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
        kcb->kprobe_status = kcb->prev_kprobe.status;
 }
 
 static void __kprobes set_current_kprobe(struct kprobe *p)
 {
-       __get_cpu_var(current_kprobe) = p;
+       __this_cpu_write(current_kprobe, p);
 }
 
 static void __kprobes
@@ -421,10 +421,10 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
                        continue;
 
                if (ri->rp && ri->rp->handler) {
-                       __get_cpu_var(current_kprobe) = &ri->rp->kp;
+                       __this_cpu_write(current_kprobe, &ri->rp->kp);
                        get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
                        ri->rp->handler(ri, regs);
-                       __get_cpu_var(current_kprobe) = NULL;
+                       __this_cpu_write(current_kprobe, NULL);
                }
 
                orig_ret_address = (unsigned long)ri->ret_addr;
index c9dfff3b80082e60fa2244efd1a23216293356ec..45e478157278e331ac6474ca5dbac859415b0fff 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/sections.h>
 #include <asm/smp_plat.h>
 #include <asm/unwind.h>
+#include <asm/opcodes.h>
 
 #ifdef CONFIG_XIP_KERNEL
 /*
@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                Elf32_Sym *sym;
                const char *symname;
                s32 offset;
+               u32 tmp;
 #ifdef CONFIG_THUMB2_KERNEL
                u32 upper, lower, sign, j1, j2;
 #endif
@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                case R_ARM_PC24:
                case R_ARM_CALL:
                case R_ARM_JUMP24:
-                       offset = (*(u32 *)loc & 0x00ffffff) << 2;
+                       offset = __mem_to_opcode_arm(*(u32 *)loc);
+                       offset = (offset & 0x00ffffff) << 2;
                        if (offset & 0x02000000)
                                offset -= 0x04000000;
 
@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        }
 
                        offset >>= 2;
+                       offset &= 0x00ffffff;
 
-                       *(u32 *)loc &= 0xff000000;
-                       *(u32 *)loc |= offset & 0x00ffffff;
+                       *(u32 *)loc &= __opcode_to_mem_arm(0xff000000);
+                       *(u32 *)loc |= __opcode_to_mem_arm(offset);
                        break;
 
               case R_ARM_V4BX:
@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        * other bits to re-code instruction as
                        * MOV PC,Rm.
                        */
-                      *(u32 *)loc &= 0xf000000f;
-                      *(u32 *)loc |= 0x01a0f000;
+                      *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f);
+                      *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000);
                       break;
 
                case R_ARM_PREL31:
@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 
                case R_ARM_MOVW_ABS_NC:
                case R_ARM_MOVT_ABS:
-                       offset = *(u32 *)loc;
+                       offset = tmp = __mem_to_opcode_arm(*(u32 *)loc);
                        offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff);
                        offset = (offset ^ 0x8000) - 0x8000;
 
@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS)
                                offset >>= 16;
 
-                       *(u32 *)loc &= 0xfff0f000;
-                       *(u32 *)loc |= ((offset & 0xf000) << 4) |
-                                       (offset & 0x0fff);
+                       tmp &= 0xfff0f000;
+                       tmp |= ((offset & 0xf000) << 4) |
+                               (offset & 0x0fff);
+
+                       *(u32 *)loc = __opcode_to_mem_arm(tmp);
                        break;
 
 #ifdef CONFIG_THUMB2_KERNEL
                case R_ARM_THM_CALL:
                case R_ARM_THM_JUMP24:
-                       upper = *(u16 *)loc;
-                       lower = *(u16 *)(loc + 2);
+                       upper = __mem_to_opcode_thumb16(*(u16 *)loc);
+                       lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
                        /*
                         * 25 bit signed address range (Thumb-2 BL and B.W
@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        sign = (offset >> 24) & 1;
                        j1 = sign ^ (~(offset >> 23) & 1);
                        j2 = sign ^ (~(offset >> 22) & 1);
-                       *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) |
+                       upper = (u16)((upper & 0xf800) | (sign << 10) |
                                            ((offset >> 12) & 0x03ff));
-                       *(u16 *)(loc + 2) = (u16)((lower & 0xd000) |
-                                                 (j1 << 13) | (j2 << 11) |
-                                                 ((offset >> 1) & 0x07ff));
+                       lower = (u16)((lower & 0xd000) |
+                                     (j1 << 13) | (j2 << 11) |
+                                     ((offset >> 1) & 0x07ff));
+
+                       *(u16 *)loc = __opcode_to_mem_thumb16(upper);
+                       *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
                        break;
 
                case R_ARM_THM_MOVW_ABS_NC:
                case R_ARM_THM_MOVT_ABS:
-                       upper = *(u16 *)loc;
-                       lower = *(u16 *)(loc + 2);
+                       upper = __mem_to_opcode_thumb16(*(u16 *)loc);
+                       lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
                        /*
                         * MOVT/MOVW instructions encoding in Thumb-2:
@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
                        if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS)
                                offset >>= 16;
 
-                       *(u16 *)loc = (u16)((upper & 0xfbf0) |
-                                           ((offset & 0xf000) >> 12) |
-                                           ((offset & 0x0800) >> 1));
-                       *(u16 *)(loc + 2) = (u16)((lower & 0x8f00) |
-                                                 ((offset & 0x0700) << 4) |
-                                                 (offset & 0x00ff));
+                       upper = (u16)((upper & 0xfbf0) |
+                                     ((offset & 0xf000) >> 12) |
+                                     ((offset & 0x0800) >> 1));
+                       lower = (u16)((lower & 0x8f00) |
+                                     ((offset & 0x0700) << 4) |
+                                     (offset & 0x00ff));
+                       *(u16 *)loc = __opcode_to_mem_thumb16(upper);
+                       *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
                        break;
 #endif
 
index e186ee1e63f6c85261f96844a594080e719c2e07..bc3f2efa0d86b4ff55d6b19833eae688b111fd27 100644 (file)
@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events,
               struct perf_event *event)
 {
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-       struct pmu *leader_pmu = event->group_leader->pmu;
 
        if (is_software_event(event))
                return 1;
 
-       if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF)
+       if (event->state < PERF_EVENT_STATE_OFF)
                return 1;
 
        if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)
index 8d6147b2001f82eca02c738265f9477dc2dbaefa..d85055cd24bacc55cde9e99f1547afbfe33317ae 100644 (file)
@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters);
 
 static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
 {
-       return &__get_cpu_var(cpu_hw_events);
+       return this_cpu_ptr(&cpu_hw_events);
 }
 
 static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c
new file mode 100644 (file)
index 0000000..6e4379c
--- /dev/null
@@ -0,0 +1,30 @@
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+       if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
+               return 0;
+
+       return regs->uregs[idx];
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
+
+int perf_reg_validate(u64 mask)
+{
+       if (!mask || mask & REG_RESERVED)
+               return -EINVAL;
+
+       return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+       return PERF_SAMPLE_REGS_ABI_32;
+}
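
With PERF_REG_ARM_MAX equal to 16, REG_RESERVED works out to ~0xffffULL, so perf_reg_validate() accepts only non-empty masks confined to the sixteen defined registers. A small hedged sanity-check sketch of that arithmetic:

        static void validate_mask_examples(void)
        {
                /* (1ULL << 16) - 1 == 0xffff, hence REG_RESERVED == ~0xffffULL */
                WARN_ON(perf_reg_validate(0) != -EINVAL);               /* empty mask   */
                WARN_ON(perf_reg_validate(1ULL << PERF_REG_ARM_PC));    /* defined reg  */
                WARN_ON(perf_reg_validate(1ULL << 16) != -EINVAL);      /* reserved bit */
        }
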
index 5d65438685d8516f6455393940e4bdaed107699a..6a1b8a81b1ae448168572a9558aaf026f9e7e47e 100644 (file)
@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup);
 #endif
 
 extern void paging_init(const struct machine_desc *desc);
+extern void early_paging_init(const struct machine_desc *,
+                             struct proc_info_list *);
 extern void sanity_check_meminfo(void);
 extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(const struct machine_desc *desc);
@@ -599,6 +601,8 @@ static void __init setup_processor(void)
        elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
 #endif
 
+       erratum_a15_798181_init();
+
        feat_v6_fixup();
 
        cacheid_init();
@@ -619,9 +623,10 @@ void __init dump_machine_table(void)
                /* can't use cpu_relax() here as it may require MMU setup */;
 }
 
-int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
+int __init arm_add_memory(u64 start, u64 size)
 {
        struct membank *bank = &meminfo.bank[meminfo.nr_banks];
+       u64 aligned_start;
 
        if (meminfo.nr_banks >= NR_BANKS) {
                printk(KERN_CRIT "NR_BANKS too low, "
@@ -634,10 +639,16 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
         * Size is appropriately rounded down, start is rounded up.
         */
        size -= start & ~PAGE_MASK;
-       bank->start = PAGE_ALIGN(start);
+       aligned_start = PAGE_ALIGN(start);
 
-#ifndef CONFIG_ARM_LPAE
-       if (bank->start + size < bank->start) {
+#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
+       if (aligned_start > ULONG_MAX) {
+               printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
+                      "32-bit physical address space\n", (long long)start);
+               return -EINVAL;
+       }
+
+       if (aligned_start + size > ULONG_MAX) {
                printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
                        "32-bit physical address space\n", (long long)start);
                /*
@@ -645,10 +656,11 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
                 * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
                 * This means we lose a page after masking.
                 */
-               size = ULONG_MAX - bank->start;
+               size = ULONG_MAX - aligned_start;
        }
 #endif
 
+       bank->start = aligned_start;
        bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);
 
        /*
@@ -669,8 +681,8 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
 static int __init early_mem(char *p)
 {
        static int usermem __initdata = 0;
-       phys_addr_t size;
-       phys_addr_t start;
+       u64 size;
+       u64 start;
        char *endp;
 
        /*
@@ -878,6 +890,8 @@ void __init setup_arch(char **cmdline_p)
        parse_early_param();
 
        sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
+
+       early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
        sanity_check_meminfo();
        arm_memblock_init(&meminfo, mdesc);
 
index ab330422527203417472a21727b1062c8a38e70f..04d63880037f9c293741230f95e9520c19ee3dee 100644 (file)
 #include <asm/unistd.h>
 #include <asm/vfp.h>
 
-/*
- * For ARM syscalls, we encode the syscall number into the instruction.
- */
-#define SWI_SYS_SIGRETURN      (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-#define SWI_SYS_RT_SIGRETURN   (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-
-/*
- * With EABI, the syscall number has to be loaded into r7.
- */
-#define MOV_R7_NR_SIGRETURN    (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE))
-#define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
-
-/*
- * For Thumb syscalls, we pass the syscall number via r7.  We therefore
- * need two 16-bit instructions.
- */
-#define SWI_THUMB_SIGRETURN    (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE))
-#define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
-
-static const unsigned long sigreturn_codes[7] = {
-       MOV_R7_NR_SIGRETURN,    SWI_SYS_SIGRETURN,    SWI_THUMB_SIGRETURN,
-       MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN,
-};
+extern const unsigned long sigreturn_codes[7];
 
 static unsigned long signal_return_offset;
 
@@ -375,12 +353,18 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig,
                 */
                thumb = handler & 1;
 
-               if (thumb) {
-                       cpsr |= PSR_T_BIT;
 #if __LINUX_ARM_ARCH__ >= 7
-                       /* clear the If-Then Thumb-2 execution state */
-                       cpsr &= ~PSR_IT_MASK;
+               /*
+                * Clear the If-Then Thumb-2 execution state. The ARM
+                * spec requires this to be all 0s in ARM mode, and
+                * Snapdragon S4/Krait misbehaves on a Thumb=>ARM
+                * signal transition without this.
+                */
+               cpsr &= ~PSR_IT_MASK;
 #endif
+
+               if (thumb) {
+                       cpsr |= PSR_T_BIT;
                } else
                        cpsr &= ~PSR_T_BIT;
        }
diff --git a/arch/arm/kernel/sigreturn_codes.S b/arch/arm/kernel/sigreturn_codes.S
new file mode 100644 (file)
index 0000000..3c5d0f2
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * sigreturn_codes.S - code snippets for sigreturn syscalls
+ *
+ * Created by: Victor Kamensky, 2013-08-13
+ * Copyright:  (C) 2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <asm/unistd.h>
+
+/*
+ * For ARM syscalls, we encode the syscall number into the instruction.
+ * With EABI, the syscall number has to be loaded into r7, so the ARM
+ * syscall sequence snippet is a move plus an svc in .arm encoding.
+ *
+ * For Thumb syscalls, we pass the syscall number via r7.  We therefore
+ * need two 16-bit instructions in .thumb encoding.
+ *
+ * Please note that the sigreturn_codes snippets are not executed in
+ * place.  Instead they are copied by the kernel into the appropriate
+ * places.  The code in arch/arm/kernel/signal.c is very sensitive to
+ * the layout of these snippets.
+ */
+
+#if __LINUX_ARM_ARCH__ <= 4
+       /*
+        * Note that for early arch versions we manually select the
+        * minimal architecture that supports the required Thumb
+        * opcodes. It is OK for this file to be combined with other,
+        * lower arch variants, since these code snippets are only
+        * used as input data.
+        */
+       .arch armv4t
+#endif
+
+       .section .rodata
+       .global sigreturn_codes
+       .type   sigreturn_codes, #object
+
+       .arm
+
+sigreturn_codes:
+
+       /* ARM sigreturn syscall code snippet */
+       mov     r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
+       swi     #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)
+
+       /* Thumb sigreturn syscall code snippet */
+       .thumb
+       movs    r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
+       swi     #0
+
+       /* ARM sigreturn_rt syscall code snippet */
+       .arm
+       mov     r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
+       swi     #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)
+
+       /* Thumb sigreturn_rt syscall code snippet */
+       .thumb
+       movs    r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
+       swi     #0
+
+       /*
+        * Note on the additional space: the setup_return code in
+        * signal.c always copies two words, regardless of whether it
+        * is the Thumb case or not, so we need an additional word
+        * after the real last entry.
+        */
+       .arm
+       .space  4
+
+       .size   sigreturn_codes, . - sigreturn_codes
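
These snippets are consumed by setup_return() in signal.c, which picks the ARM or Thumb, sigreturn or rt_sigreturn variant and copies two words onto the user stack as the return trampoline. A hedged sketch of that selection; the index arithmetic mirrors the table layout above, while the function wrapper and its names are illustrative:

        static void copy_retcode(unsigned long retcode[2], int thumb, int rt)
        {
                unsigned int idx = thumb << 1;          /* 0 = ARM, 2 = Thumb */

                if (rt)                                 /* rt_sigreturn variants */
                        idx += 3;

                retcode[0] = sigreturn_codes[idx];
                retcode[1] = sigreturn_codes[idx + 1];  /* always two words */
        }
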
index db1536b8b30b497fe11923dfe9f67a72f86f7954..b907d9b790ab7234171c713c4e6c9aed957868ea 100644 (file)
@@ -55,6 +55,7 @@
  * specific registers and some other data for resume.
  *  r0 = suspend function arg0
  *  r1 = suspend function
+ *  r2 = MPIDR value the resuming CPU will use
  */
 ENTRY(__cpu_suspend)
        stmfd   sp!, {r4 - r11, lr}
@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend)
        mov     r5, sp                  @ current virtual SP
        add     r4, r4, #12             @ Space for pgd, virt sp, phys resume fn
        sub     sp, sp, r4              @ allocate CPU state on stack
-       stmfd   sp!, {r0, r1}           @ save suspend func arg and pointer
-       add     r0, sp, #8              @ save pointer to save block
-       mov     r1, r4                  @ size of save block
-       mov     r2, r5                  @ virtual SP
        ldr     r3, =sleep_save_sp
+       stmfd   sp!, {r0, r1}           @ save suspend func arg and pointer
        ldr     r3, [r3, #SLEEP_SAVE_SP_VIRT]
-       ALT_SMP(mrc p15, 0, r9, c0, c0, 5)
-        ALT_UP_B(1f)
-       ldr     r8, =mpidr_hash
-       /*
-        * This ldmia relies on the memory layout of the mpidr_hash
-        * struct mpidr_hash.
-        */
-       ldmia   r8, {r4-r7}     @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts
-       compute_mpidr_hash      lr, r5, r6, r7, r9, r4
-       add     r3, r3, lr, lsl #2
-1:
+       ALT_SMP(ldr r0, =mpidr_hash)
+       ALT_UP_B(1f)
+       /* This ldmia relies on the memory layout of the mpidr_hash struct */
+       ldmia   r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
+       compute_mpidr_hash      r0, r6, r7, r8, r2, r1
+       add     r3, r3, r0, lsl #2
+1:     mov     r2, r5                  @ virtual SP
+       mov     r1, r4                  @ size of save block
+       add     r0, sp, #8              @ pointer to save block
        bl      __cpu_suspend_save
        adr     lr, BSYM(cpu_suspend_abort)
        ldmfd   sp!, {r0, pc}           @ call suspend fn
@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu)
        .data
        .align
 ENTRY(cpu_resume)
+ARM_BE8(setend be)                     @ ensure we are in BE mode
        mov     r1, #0
        ALT_SMP(mrc p15, 0, r0, c0, c0, 5)
        ALT_UP_B(1f)
index 72024ea8a3a6c07038103e153527cb2454e4552f..dc894ab3622b1effac1b885bd919bf629f66dc5d 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/clockchips.h>
 #include <linux/completion.h>
 #include <linux/cpufreq.h>
+#include <linux/irq_work.h>
 
 #include <linux/atomic.h>
 #include <asm/smp.h>
@@ -66,6 +67,8 @@ enum ipi_msg_type {
        IPI_CALL_FUNC,
        IPI_CALL_FUNC_SINGLE,
        IPI_CPU_STOP,
+       IPI_IRQ_WORK,
+       IPI_COMPLETION,
 };
 
 static DECLARE_COMPLETION(cpu_running);
@@ -80,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops)
 
 static unsigned long get_arch_pgd(pgd_t *pgd)
 {
-       phys_addr_t pgdir = virt_to_phys(pgd);
+       phys_addr_t pgdir = virt_to_idmap(pgd);
        BUG_ON(pgdir & ARCH_PGD_MASK);
        return pgdir >> ARCH_PGD_SHIFT;
 }
@@ -448,6 +451,14 @@ void arch_send_call_function_single_ipi(int cpu)
        smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
 }
 
+#ifdef CONFIG_IRQ_WORK
+void arch_irq_work_raise(void)
+{
+       if (is_smp())
+               smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
+}
+#endif
+
 static const char *ipi_types[NR_IPI] = {
 #define S(x,s) [x] = s
        S(IPI_WAKEUP, "CPU wakeup interrupts"),
@@ -456,6 +467,8 @@ static const char *ipi_types[NR_IPI] = {
        S(IPI_CALL_FUNC, "Function call interrupts"),
        S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
        S(IPI_CPU_STOP, "CPU stop interrupts"),
+       S(IPI_IRQ_WORK, "IRQ work interrupts"),
+       S(IPI_COMPLETION, "completion interrupts"),
 };
 
 void show_ipi_list(struct seq_file *p, int prec)
@@ -515,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu)
                cpu_relax();
 }
 
+static DEFINE_PER_CPU(struct completion *, cpu_completion);
+
+int register_ipi_completion(struct completion *completion, int cpu)
+{
+       per_cpu(cpu_completion, cpu) = completion;
+       return IPI_COMPLETION;
+}
+
+static void ipi_complete(unsigned int cpu)
+{
+       complete(per_cpu(cpu_completion, cpu));
+}
+
 /*
  * Main handler for inter-processor interrupts
  */
@@ -565,6 +591,20 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
                irq_exit();
                break;
 
+#ifdef CONFIG_IRQ_WORK
+       case IPI_IRQ_WORK:
+               irq_enter();
+               irq_work_run();
+               irq_exit();
+               break;
+#endif
+
+       case IPI_COMPLETION:
+               irq_enter();
+               ipi_complete(cpu);
+               irq_exit();
+               break;
+
        default:
                printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
                       cpu, ipinr);
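
IPI_COMPLETION lets a CPU be woken from its own IPI path once a remote CPU has finished some work (the big.LITTLE switcher is the expected user). A hedged usage sketch; how the remote CPU is told to raise the returned IPI number depends on the caller and is left as a comment:

        static void wait_for_remote_kick(void)
        {
                struct completion done;
                int cpu = get_cpu();
                int ipi;

                init_completion(&done);
                ipi = register_ipi_completion(&done, cpu);
                put_cpu();
                /* ... ask the remote CPU to raise IPI number 'ipi' at 'cpu' ... */
                wait_for_completion(&done);     /* completed from this CPU's IPI handler */
        }
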
index 5bc1a63284e3913a19afc53c71aefc1847e4c667..1aafa0d785eb835dd50d6036a855fc2bbf8ea32e 100644 (file)
@@ -28,7 +28,7 @@
  */
 unsigned int __init scu_get_core_count(void __iomem *scu_base)
 {
-       unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG);
+       unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG);
        return (ncores & 0x03) + 1;
 }
 
@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base)
 #ifdef CONFIG_ARM_ERRATA_764369
        /* Cortex-A9 only */
        if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) {
-               scu_ctrl = __raw_readl(scu_base + 0x30);
+               scu_ctrl = readl_relaxed(scu_base + 0x30);
                if (!(scu_ctrl & 1))
-                       __raw_writel(scu_ctrl | 0x1, scu_base + 0x30);
+                       writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30);
        }
 #endif
 
-       scu_ctrl = __raw_readl(scu_base + SCU_CTRL);
+       scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
        /* already enabled? */
        if (scu_ctrl & 1)
                return;
 
        scu_ctrl |= 1;
-       __raw_writel(scu_ctrl, scu_base + SCU_CTRL);
+       writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
 
        /*
         * Ensure that the data accessed by CPU0 before the SCU was
@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode)
        if (mode > 3 || mode == 1 || cpu > 3)
                return -EINVAL;
 
-       val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
+       val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
        val |= mode;
-       __raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu);
+       writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu);
 
        return 0;
 }
index 83ccca303df83c4a1f40dce35f324153d979ad16..95d063620b76a6f706bccc23635537ed4bceb01a 100644 (file)
@@ -70,6 +70,40 @@ static inline void ipi_flush_bp_all(void *ignored)
        local_flush_bp_all();
 }
 
+#ifdef CONFIG_ARM_ERRATA_798181
+bool (*erratum_a15_798181_handler)(void);
+
+static bool erratum_a15_798181_partial(void)
+{
+       asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+       dsb(ish);
+       return false;
+}
+
+static bool erratum_a15_798181_broadcast(void)
+{
+       asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+       dsb(ish);
+       return true;
+}
+
+void erratum_a15_798181_init(void)
+{
+       unsigned int midr = read_cpuid_id();
+       unsigned int revidr = read_cpuid(CPUID_REVIDR);
+
+       /* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
+       if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
+           (revidr & 0x210) == 0x210) {
+               return;
+       }
+       if (revidr & 0x10)
+               erratum_a15_798181_handler = erratum_a15_798181_partial;
+       else
+               erratum_a15_798181_handler = erratum_a15_798181_broadcast;
+}
+#endif
+
 static void ipi_flush_tlb_a15_erratum(void *arg)
 {
        dmb();
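
The MIDR comparison above encodes the affected revision window directly: for Cortex-A15 the variant (rN) sits in MIDR[23:20] and the revision (pN) in MIDR[3:0], so 0x413fc0f2 decodes to r3p2. A small hedged decoding sketch (helper names are illustrative):

        static unsigned int midr_variant(unsigned int midr)
        {
                return (midr >> 20) & 0xf;      /* rN, e.g. 3 for 0x413fc0f2 */
        }

        static unsigned int midr_revision(unsigned int midr)
        {
                return midr & 0xf;              /* pN, e.g. 2 for 0x413fc0f2 */
        }
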
@@ -80,7 +114,6 @@ static void broadcast_tlb_a15_erratum(void)
        if (!erratum_a15_798181())
                return;
 
-       dummy_flush_tlb_a15_erratum();
        smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1);
 }
 
@@ -92,7 +125,6 @@ static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm)
        if (!erratum_a15_798181())
                return;
 
-       dummy_flush_tlb_a15_erratum();
        this_cpu = get_cpu();
        a15_erratum_get_cpumask(this_cpu, mm, &mask);
        smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1);
index 2985c9f0905d0e8667210e7bcdc0b51765781524..6591e26fc13f4eab5fcd3a7b1292a3ca7cf3bd92 100644 (file)
@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode,
        case CLOCK_EVT_MODE_PERIODIC:
                ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE
                        | TWD_TIMER_CONTROL_PERIODIC;
-               __raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
+               writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
                        twd_base + TWD_TIMER_LOAD);
                break;
        case CLOCK_EVT_MODE_ONESHOT:
@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode,
                ctrl = 0;
        }
 
-       __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
+       writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
 }
 
 static int twd_set_next_event(unsigned long evt,
                        struct clock_event_device *unused)
 {
-       unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL);
+       unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL);
 
        ctrl |= TWD_TIMER_CONTROL_ENABLE;
 
-       __raw_writel(evt, twd_base + TWD_TIMER_COUNTER);
-       __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
+       writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER);
+       writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
 
        return 0;
 }
@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt,
  */
 static int twd_timer_ack(void)
 {
-       if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) {
-               __raw_writel(1, twd_base + TWD_TIMER_INTSTAT);
+       if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) {
+               writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT);
                return 1;
        }
 
@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void)
                waitjiffies += 5;
 
                                 /* enable, no interrupt or reload */
-               __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL);
+               writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL);
 
                                 /* maximum value */
-               __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
+               writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
 
                while (get_jiffies_64() < waitjiffies)
                        udelay(10);
 
-               count = __raw_readl(twd_base + TWD_TIMER_COUNTER);
+               count = readl_relaxed(twd_base + TWD_TIMER_COUNTER);
 
                twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
 
@@ -277,7 +277,7 @@ static void twd_timer_setup(void)
         * bother with the below.
         */
        if (per_cpu(percpu_setup_called, cpu)) {
-               __raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+               writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
                clockevents_register_device(clk);
                enable_percpu_irq(clk->irq, 0);
                return;
@@ -290,7 +290,7 @@ static void twd_timer_setup(void)
         * The following is done once per CPU the first time .setup() is
         * called.
         */
-       __raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+       writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
 
        clk->name = "local_timer";
        clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
index 41cf3cbf756de473be3bb1ebb268445aeacadc5d..2835d35234ca459f4d7086a6f811ff35652a6a11 100644 (file)
@@ -10,7 +10,7 @@
 #include <asm/suspend.h>
 #include <asm/tlbflush.h>
 
-extern int __cpu_suspend(unsigned long, int (*)(unsigned long));
+extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
 extern void cpu_resume_mmu(void);
 
 #ifdef CONFIG_MMU
@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void);
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
        struct mm_struct *mm = current->active_mm;
+       u32 __mpidr = cpu_logical_map(smp_processor_id());
        int ret;
 
        if (!idmap_pgd)
@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
         * resume (indicated by a zero return code), we need to switch
         * back to the correct page tables.
         */
-       ret = __cpu_suspend(arg, fn);
+       ret = __cpu_suspend(arg, fn, __mpidr);
        if (ret == 0) {
                cpu_switch_mm(mm->pgd, mm);
                local_flush_bp_all();
@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 #else
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
-       return __cpu_suspend(arg, fn);
+       u32 __mpidr = cpu_logical_map(smp_processor_id());
+       return __cpu_suspend(arg, fn, __mpidr);
 }
 #define        idmap_pgd       NULL
 #endif
index 8fcda140358d94d6056ea200b79b50bb6e7bde81..6125f259b7b5359072b0cd7a07e122fcd2bda4bd 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/unwind.h>
 #include <asm/tls.h>
 #include <asm/system_misc.h>
+#include <asm/opcodes.h>
 
 static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" };
 
@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
 int is_valid_bugaddr(unsigned long pc)
 {
 #ifdef CONFIG_THUMB2_KERNEL
-       unsigned short bkpt;
+       u16 bkpt;
+       u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE);
 #else
-       unsigned long bkpt;
+       u32 bkpt;
+       u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE);
 #endif
 
        if (probe_kernel_address((unsigned *)pc, bkpt))
                return 0;
 
-       return bkpt == BUG_INSTR_VALUE;
+       return bkpt == insn;
 }
 
 #endif
@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
        if (processor_mode(regs) == SVC_MODE) {
 #ifdef CONFIG_THUMB2_KERNEL
                if (thumb_mode(regs)) {
-                       instr = ((u16 *)pc)[0];
+                       instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]);
                        if (is_wide_instruction(instr)) {
-                               instr <<= 16;
-                               instr |= ((u16 *)pc)[1];
+                               u16 inst2;
+                               inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]);
+                               instr = __opcode_thumb32_compose(instr, inst2);
                        }
                } else
 #endif
-                       instr = *(u32 *) pc;
+                       instr = __mem_to_opcode_arm(*(u32 *) pc);
        } else if (thumb_mode(regs)) {
                if (get_user(instr, (u16 __user *)pc))
                        goto die_sig;
+               instr = __mem_to_opcode_thumb16(instr);
                if (is_wide_instruction(instr)) {
                        unsigned int instr2;
                        if (get_user(instr2, (u16 __user *)pc+1))
                                goto die_sig;
-                       instr <<= 16;
-                       instr |= instr2;
+                       instr2 = __mem_to_opcode_thumb16(instr2);
+                       instr = __opcode_thumb32_compose(instr, instr2);
                }
        } else if (get_user(instr, (u32 __user *)pc)) {
+               instr = __mem_to_opcode_arm(instr);
                goto die_sig;
        }
 
index 9c697db2787e2a524cf59db4519f5272fa2918b0..aea7ccb8d3970f236c34feb7d668c67bcb925b01 100644 (file)
@@ -65,7 +65,7 @@ static bool vgic_present;
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
        BUG_ON(preemptible());
-       __get_cpu_var(kvm_arm_running_vcpu) = vcpu;
+       __this_cpu_write(kvm_arm_running_vcpu, vcpu);
 }
 
 /**
@@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
 {
        BUG_ON(preemptible());
-       return __get_cpu_var(kvm_arm_running_vcpu);
+       return __this_cpu_read(kvm_arm_running_vcpu);
 }
 
 /**
@@ -815,7 +815,7 @@ static void cpu_init_hyp_mode(void *dummy)
 
        boot_pgd_ptr = kvm_mmu_get_boot_httbr();
        pgd_ptr = kvm_mmu_get_httbr();
-       stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
+       stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)__kvm_hyp_vector;
 
index d6408d1ee543fe5e3ceabbcda01b25efb07676ba..e0c68d5bb7dc25dd3fa93dc0fa1b3899f5b09019 100644 (file)
@@ -10,6 +10,11 @@ UNWIND(      .fnstart        )
        and     r3, r0, #31             @ Get bit offset
        mov     r0, r0, lsr #5
        add     r1, r1, r0, lsl #2      @ Get word offset
+#if __LINUX_ARM_ARCH__ >= 7
+       .arch_extension mp
+       ALT_SMP(W(pldw) [r1])
+       ALT_UP(W(nop))
+#endif
        mov     r3, r2, lsl r3
 1:     ldrex   r2, [r1]
        \instr  r2, r2, r3
index 025f742dd4df6bf79b279babd264980d851f01d5..3e58d710013c3ad9b377fc76e6dad58f377e88a7 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/hardirq.h> /* for in_atomic() */
 #include <linux/gfp.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <asm/current.h>
 #include <asm/page.h>
 
@@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
                return 0;
 
        pmd = pmd_offset(pud, addr);
-       if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+       if (unlikely(pmd_none(*pmd)))
+               return 0;
+
+       /*
+        * A pmd can be bad if it refers to a HugeTLB or THP page.
+        *
+        * Both THP and HugeTLB pages have the same pmd layout
+        * and should not be manipulated by the pte functions.
+        *
+        * Lock the page table for the destination and check that it
+        * is still huge, that a write would not fault, and that the
+        * THP is not in the middle of a split.
+        */
+       if (unlikely(pmd_thp_or_huge(*pmd))) {
+               ptl = &current->mm->page_table_lock;
+               spin_lock(ptl);
+               if (unlikely(!pmd_thp_or_huge(*pmd)
+                       || pmd_hugewillfault(*pmd)
+                       || pmd_trans_splitting(*pmd))) {
+                       spin_unlock(ptl);
+                       return 0;
+               }
+
+               *ptep = NULL;
+               *ptlp = ptl;
+               return 1;
+       }
+
+       if (unlikely(pmd_bad(*pmd)))
                return 0;
 
        pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
@@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
                from += tocopy;
                n -= tocopy;
 
-               pte_unmap_unlock(pte, ptl);
+               if (pte)
+                       pte_unmap_unlock(pte, ptl);
+               else
+                       spin_unlock(ptl);
        }
        if (!atomic)
                up_read(&current->mm->mmap_sem);
@@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
                addr += tocopy;
                n -= tocopy;
 
-               pte_unmap_unlock(pte, ptl);
+               if (pte)
+                       pte_unmap_unlock(pte, ptl);
+               else
+                       spin_unlock(ptl);
        }
        up_read(&current->mm->mmap_sem);
 
index 1fd2cf097e30fcfc10ce6ff7f200180340464496..eb1fa5c84723df5062fa54a154ef737653fa19b4 100644 (file)
@@ -692,14 +692,14 @@ static void netwinder_led_set(struct led_classdev *cdev,
        unsigned long flags;
        u32 reg;
 
-       spin_lock_irqsave(&nw_gpio_lock, flags);
+       raw_spin_lock_irqsave(&nw_gpio_lock, flags);
        reg = nw_gpio_read();
        if (b != LED_OFF)
                reg &= ~led->mask;
        else
                reg |= led->mask;
        nw_gpio_modify_op(led->mask, reg);
-       spin_unlock_irqrestore(&nw_gpio_lock, flags);
+       raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
 }
 
 static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
@@ -709,9 +709,9 @@ static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
        unsigned long flags;
        u32 reg;
 
-       spin_lock_irqsave(&nw_gpio_lock, flags);
+       raw_spin_lock_irqsave(&nw_gpio_lock, flags);
        reg = nw_gpio_read();
-       spin_unlock_irqrestore(&nw_gpio_lock, flags);
+       raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
 
        return (reg & led->mask) ? LED_OFF : LED_FULL;
 }
index fe98df44579cfa5c22c77ebd53dd9efbee15cf1e..08332d84144052899382d6d2543dc4d754779e6f 100644 (file)
@@ -4,11 +4,12 @@ config ARCH_HIGHBANK
        select ARCH_HAS_CPUFREQ
        select ARCH_HAS_HOLES_MEMORYMODEL
        select ARCH_HAS_OPP
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select ARM_AMBA
        select ARM_ERRATA_764369
        select ARM_ERRATA_775420
-       select ARM_ERRATA_798181
+       select ARM_ERRATA_798181 if SMP
        select ARM_GIC
        select ARM_PSCI
        select ARM_TIMER_SP804
index 30e1ebe3a8916e7f4fb9eb00397c180ea4786ff6..c342dc4e8a45c43097f6c28e1e45ba848e57a7f3 100644 (file)
@@ -1,9 +1,5 @@
 if ARCH_IXP4XX
 
-config ARCH_SUPPORTS_BIG_ENDIAN
-       bool
-       default y
-
 menu "Intel IXP4xx Implementation Options"
 
 comment "IXP4xx Platforms"
index 9eb63d7246023e061ff80a91b2a600f691906c58..5e269d7263cea17b31cfd9c4294a0a3a8ce9d84f 100644 (file)
@@ -1,5 +1,6 @@
 config ARCH_MVEBU
        bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select CLKSRC_MMIO
        select COMMON_CLK
        select GENERIC_CLOCKEVENTS
index 5476669ba9056ff80d63fab31d8f266ef25a2652..ee7598fe75db873dc81843610939d189969833a3 100644 (file)
@@ -20,6 +20,8 @@
 #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0
 #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4
 
+#include <asm/assembler.h>
+
        .text
 /*
  * r0: Coherency fabric base register address
@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent)
        /* Create bit by cpu index */
        mov     r3, #(1 << 24)
        lsl     r1, r3, r1
+ARM_BE8(rev    r1, r1)
 
        /* Add CPU to SMP group - Atomic */
        add     r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET
index 8a1b0c96e9ece9a05ba3c78a04e51c5d65c3f240..3dd80df428f7edbf92885011a64141bf472add8c 100644 (file)
 #include <linux/linkage.h>
 #include <linux/init.h>
 
+#include <asm/assembler.h>
+
 /*
  * Armada XP specific entry point for secondary CPUs.
  * We add the CPU to the coherency fabric and then jump to secondary
  * startup
  */
 ENTRY(armada_xp_secondary_startup)
+ ARM_BE8(setend        be )                    @ go BE8 if entered LE
+
        /* Get coherency fabric base physical address */
        adr     r0, 1f
        ldr     r1, [r0]
index e838ba27e443c1dd0b54042c3e4afd882ae22265..c9808c6841526204144e36a273596e6955696dc8 100644 (file)
@@ -512,6 +512,9 @@ static void __init assabet_map_io(void)
         * Its called GPCLKR0 in my SA1110 manual.
         */
        Ser1SDCR0 |= SDCR0_SUS;
+       MSC1 = (MSC1 & ~0xffff) |
+               MSC_NonBrst | MSC_32BitStMem |
+               MSC_RdAcc(2) | MSC_WrAcc(2) | MSC_Rec(0);
 
        if (!machine_has_neponset())
                sa1100_register_uart_fns(&assabet_port_fns);
diff --git a/arch/arm/mach-sa1100/include/mach/gpio.h b/arch/arm/mach-sa1100/include/mach/gpio.h
deleted file mode 100644 (file)
index 6a9eecf..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * arch/arm/mach-sa1100/include/mach/gpio.h
- *
- * SA1100 GPIO wrappers for arch-neutral GPIO calls
- *
- * Written by Philipp Zabel <philipp.zabel@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#ifndef __ASM_ARCH_SA1100_GPIO_H
-#define __ASM_ARCH_SA1100_GPIO_H
-
-#include <linux/io.h>
-#include <mach/hardware.h>
-#include <asm/irq.h>
-#include <asm-generic/gpio.h>
-
-#define __ARM_GPIOLIB_COMPLEX
-
-static inline int gpio_get_value(unsigned gpio)
-{
-       if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-               return GPLR & GPIO_GPIO(gpio);
-       else
-               return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-       if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-               if (value)
-                       GPSR = GPIO_GPIO(gpio);
-               else
-                       GPCR = GPIO_GPIO(gpio);
-       else
-               __gpio_set_value(gpio, value);
-}
-
-#define gpio_cansleep  __gpio_cansleep
-
-#endif
index 7d9df16f04a2276ccceba5b8315cf296988a2334..c810620db53d60235916e8ecf25c6402f2b984b1 100644 (file)
@@ -13,6 +13,8 @@
 #ifndef _INCLUDE_H3XXX_H_
 #define _INCLUDE_H3XXX_H_
 
+#include "hardware.h" /* Gives GPIO_MAX */
+
 /* Physical memory regions corresponding to chip selects */
 #define H3600_EGPIO_PHYS       (SA1100_CS5_PHYS + 0x01000000)
 #define H3600_BANK_2_PHYS      SA1100_CS2_PHYS
index bcbc94540e4574c605b99a82f4e642db9afda26b..41e476e571d7f8b2be7a78b312fe3d699c8eebe8 100644 (file)
@@ -19,6 +19,7 @@
 
 #include <mach/hardware.h>
 #include <asm/setup.h>
+#include <asm/irq.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
index 0bf04a0bca9d5c4d5d831b730de89cff332bb7b4..09e740f58b274184f0cd22b282def431d037b6d9 100644 (file)
@@ -51,7 +51,7 @@ config ARCH_TEGRA_3x_SOC
 
 config ARCH_TEGRA_114_SOC
        bool "Enable support for Tegra114 family"
-       select ARM_ERRATA_798181
+       select ARM_ERRATA_798181 if SMP
        select ARM_L1_CACHE_SHIFT_6
        select HAVE_ARM_ARCH_TIMER
        select PINCTRL_TEGRA114
index d7e7422527cac791938e4006873bad4fc8883d64..cbbb81e0e5096f7214e178fbed58b746a8dbf41b 100644 (file)
@@ -1,6 +1,7 @@
 config ARCH_VEXPRESS
        bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7
        select ARCH_REQUIRE_GPIOLIB
+       select ARCH_SUPPORTS_BIG_ENDIAN
        select ARM_AMBA
        select ARM_GIC
        select ARM_TIMER_SP804
index 3a6384c6c4356129733962ec286a8fae5d83f779..14d499688736b3c816c082f3216aa12503005b64 100644 (file)
@@ -133,38 +133,8 @@ static void dcscb_power_down(void)
        if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
                arch_spin_unlock(&dcscb_lock);
 
-               /*
-                * Flush all cache levels for this cluster.
-                *
-                * To do so we do:
-                * - Clear the SCTLR.C bit to prevent further cache allocations
-                * - Flush the whole cache
-                * - Clear the ACTLR "SMP" bit to disable local coherency
-                *
-                * Let's do it in the safest possible way i.e. with
-                * no memory access within the following sequence
-                * including to the stack.
-                *
-                * Note: fp is preserved to the stack explicitly prior doing
-                * this since adding it to the clobber list is incompatible
-                * with having CONFIG_FRAME_POINTER=y.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_all \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               /* Flush all cache levels for this cluster. */
+               v7_exit_coherency_flush(all);
 
                /*
                 * This is a harmless no-op.  On platforms with a real
@@ -183,26 +153,8 @@ static void dcscb_power_down(void)
        } else {
                arch_spin_unlock(&dcscb_lock);
 
-               /*
-                * Flush the local CPU cache.
-                * Let's do it in the safest possible way as above.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_louis \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               /* Disable and flush the local CPU cache. */
+               v7_exit_coherency_flush(louis);
        }
 
        __mcpm_cpu_down(cpu, cluster);
index e6eb4819291241f30b51d5e7b58c14d1d07c0d32..4eb92ebfd95322e50c1428f7310c8c19e65abd2b 100644 (file)
@@ -156,32 +156,7 @@ static void tc2_pm_down(u64 residency)
                        : : "r" (0x400) );
                }
 
-               /*
-                * We need to disable and flush the whole (L1 and L2) cache.
-                * Let's do it in the safest possible way i.e. with
-                * no memory access within the following sequence
-                * including the stack.
-                *
-                * Note: fp is preserved to the stack explicitly prior doing
-                * this since adding it to the clobber list is incompatible
-                * with having CONFIG_FRAME_POINTER=y.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_all \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               v7_exit_coherency_flush(all);
 
                cci_disable_port_by_cpu(mpidr);
 
@@ -197,26 +172,7 @@ static void tc2_pm_down(u64 residency)
 
                arch_spin_unlock(&tc2_pm_lock);
 
-               /*
-                * We need to disable and flush only the L1 cache.
-                * Let's do it in the safest possible way as above.
-                */
-               asm volatile(
-               "str    fp, [sp, #-4]! \n\t"
-               "mrc    p15, 0, r0, c1, c0, 0   @ get CR \n\t"
-               "bic    r0, r0, #"__stringify(CR_C)" \n\t"
-               "mcr    p15, 0, r0, c1, c0, 0   @ set CR \n\t"
-               "isb    \n\t"
-               "bl     v7_flush_dcache_louis \n\t"
-               "clrex  \n\t"
-               "mrc    p15, 0, r0, c1, c0, 1   @ get AUXCR \n\t"
-               "bic    r0, r0, #(1 << 6)       @ disable local coherency \n\t"
-               "mcr    p15, 0, r0, c1, c0, 1   @ set AUXCR \n\t"
-               "isb    \n\t"
-               "dsb    \n\t"
-               "ldr    fp, [sp], #4"
-               : : : "r0","r1","r2","r3","r4","r5","r6","r7",
-                     "r9","r10","lr","memory");
+               v7_exit_coherency_flush(louis);
        }
 
        __mcpm_cpu_down(cpu, cluster);
index cd2c88e7a8f7557bfe299a7a6b364c395a969428..1f8fed94c2a499939354258fc61339a10d229a4f 100644 (file)
@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS
        help
          This option allows the use of custom mandatory barriers
          included via the mach/barriers.h file.
+
+config ARCH_SUPPORTS_BIG_ENDIAN
+       bool
+       help
+         This option specifies that the architecture can support big
+         endian operation.
index 80741992a9fcff0b98d963ec3465bf407c51c5f6..3815a8262af070b98f33d61ac31908961ee06eb2 100644 (file)
@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
        bne     do_DataAbort
        bic     r1, r1, #1 << 11                @ clear bit 11 of FSR
        ldr     r3, [r4]                        @ read aborted ARM instruction
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       rev     r3, r3
-#endif
+ ARM_BE8(rev   r3, r3)
+
        do_ldrd_abort tmp=ip, insn=r3
        tst     r3, #1 << 20                    @ L = 0 -> write
        orreq   r1, r1, #1 << 11                @ yes.
index 6f4585b89078749e39383d1aee50f51040624a74..924036473b16f437019002b8afac36a2a41e4010 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/cp15.h>
 #include <asm/system_info.h>
 #include <asm/unaligned.h>
+#include <asm/opcodes.h>
 
 #include "fault.h"
 
@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
        if (thumb_mode(regs)) {
                u16 *ptr = (u16 *)(instrptr & ~1);
                fault = probe_kernel_address(ptr, tinstr);
+               tinstr = __mem_to_opcode_thumb16(tinstr);
                if (!fault) {
                        if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
                            IS_T32(tinstr)) {
                                /* Thumb-2 32-bit */
                                u16 tinst2 = 0;
                                fault = probe_kernel_address(ptr + 1, tinst2);
-                               instr = (tinstr << 16) | tinst2;
+                               tinst2 = __mem_to_opcode_thumb16(tinst2);
+                               instr = __opcode_thumb32_compose(tinstr, tinst2);
                                thumb2_32b = 1;
                        } else {
                                isize = 2;
                                instr = thumb2arm(tinstr);
                        }
                }
-       } else
+       } else {
                fault = probe_kernel_address(instrptr, instr);
+               instr = __mem_to_opcode_arm(instr);
+       }
 
        if (fault) {
                type = TYPE_FAULT;
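
The alignment-fault change above routes each Thumb halfword through __mem_to_opcode_thumb16() and then combines the pair with __opcode_thumb32_compose() instead of open-coding the shift. As an illustration of the compose step only — a user-space sketch with hypothetical halfword values, leaving the BE8 byte-swapping helpers out:

#include <stdint.h>
#include <stdio.h>

/* Same composition rule as __opcode_thumb32_compose(): first halfword in
   the upper 16 bits, second halfword in the lower 16 bits. */
static uint32_t thumb32_compose(uint16_t first, uint16_t second)
{
        return ((uint32_t)first << 16) | second;
}

int main(void)
{
        uint16_t tinstr = 0xf8d0, tinst2 = 0x1004;      /* hypothetical halfwords */

        printf("composed opcode: 0x%08x\n",
               (unsigned)thumb32_compose(tinstr, tinst2));
        return 0;
}
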
index 644d91f73b00f3c5663b9e273d91d51a50218dd5..79f8b39801a8e570bd86ea80e566ee939bbb600b 100644 (file)
@@ -707,7 +707,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                    gfp_t gfp, struct dma_attrs *attrs)
 {
-       pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+       pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
        void *memory;
 
        if (dma_alloc_from_coherent(dev, size, handle, &memory))
@@ -720,7 +720,7 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
        dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+       pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
        void *memory;
 
        if (dma_alloc_from_coherent(dev, size, handle, &memory))
index 9d285626bc7da4fe3e2be2e952c38923c05f5204..312e15e6d00b8a6b909747305166e1bc7f581bf0 100644 (file)
@@ -9,8 +9,13 @@ int fixup_exception(struct pt_regs *regs)
        const struct exception_table_entry *fixup;
 
        fixup = search_exception_tables(instruction_pointer(regs));
-       if (fixup)
+       if (fixup) {
                regs->ARM_pc = fixup->fixup;
+#ifdef CONFIG_THUMB2_KERNEL
+               /* Clear the IT state to avoid nasty surprises in the fixup */
+               regs->ARM_cpsr &= ~PSR_IT_MASK;
+#endif
+       }
 
        return fixup != NULL;
 }
index 83cb3ac27095146f3f60c04047c6b212856a73b2..8e0e52eb76b57d7f9d4208fafbcdf024be369c75 100644 (file)
@@ -10,6 +10,7 @@
 #include <asm/system_info.h>
 
 pgd_t *idmap_pgd;
+phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
 
 #ifdef CONFIG_ARM_LPAE
 static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
        unsigned long addr, end;
        unsigned long next;
 
-       addr = virt_to_phys(text_start);
-       end = virt_to_phys(text_end);
+       addr = virt_to_idmap(text_start);
+       end = virt_to_idmap(text_end);
+       pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
 
        prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
 
@@ -90,8 +92,6 @@ static int __init init_static_idmap(void)
        if (!idmap_pgd)
                return -ENOMEM;
 
-       pr_info("Setting up static identity map for 0x%p - 0x%p\n",
-               __idmap_text_start, __idmap_text_end);
        identity_mapping_add(idmap_pgd, __idmap_text_start,
                             __idmap_text_end, 0);
 
index 0c6356255fe31122f2527a0a2947439e69b0e953..d27158c38eb0b190b869e028b93d8265fb90969e 100644 (file)
@@ -202,13 +202,11 @@ int valid_phys_addr_range(phys_addr_t addr, size_t size)
 }
 
 /*
- * We don't use supersection mappings for mmap() on /dev/mem, which
- * means that we can't map the memory area above the 4G barrier into
- * userspace.
+ * Do not allow /dev/mem mappings beyond the supported physical range.
  */
 int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 {
-       return !(pfn + (size >> PAGE_SHIFT) > 0x00100000);
+       return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
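
For a sense of what the new bound allows: a stand-alone sketch comparing the old hard-coded page-frame limit with one derived from PHYS_MASK. The 4 KiB page size and the 40-bit LPAE physical mask are assumptions for illustration; the kernel takes the real values from its headers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned page_shift = 12;                 /* assumed 4 KiB pages */
        const uint64_t phys_mask = (1ULL << 40) - 1;    /* assumed 40-bit LPAE mask */

        uint64_t old_limit_pages = 0x00100000;          /* old hard-coded cap */
        uint64_t new_limit_pages = 1 + (phys_mask >> page_shift);

        printf("old /dev/mem mmap limit: %llu GiB\n",
               (unsigned long long)(old_limit_pages >> (30 - page_shift)));
        printf("new /dev/mem mmap limit: %llu GiB\n",
               (unsigned long long)(new_limit_pages >> (30 - page_shift)));
        return 0;
}
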
index b1d17eeb59b895cd429e762d082d6c5c56c3ff57..78eeeca78f5ab331707fcd73b4956c503c2d880b 100644 (file)
@@ -28,6 +28,8 @@
 #include <asm/highmem.h>
 #include <asm/system_info.h>
 #include <asm/traps.h>
+#include <asm/procinfo.h>
+#include <asm/memory.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void)
        }
 }
 
+#ifdef CONFIG_ARM_LPAE
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+                             struct proc_info_list *procinfo)
+{
+       pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
+       unsigned long map_start, map_end;
+       pgd_t *pgd0, *pgdk;
+       pud_t *pud0, *pudk, *pud_start;
+       pmd_t *pmd0, *pmdk;
+       phys_addr_t phys;
+       int i;
+
+       if (!(mdesc->init_meminfo))
+               return;
+
+       /* remap kernel code and data */
+       map_start = init_mm.start_code;
+       map_end   = init_mm.brk;
+
+       /* get a handle on things... */
+       pgd0 = pgd_offset_k(0);
+       pud_start = pud0 = pud_offset(pgd0, 0);
+       pmd0 = pmd_offset(pud0, 0);
+
+       pgdk = pgd_offset_k(map_start);
+       pudk = pud_offset(pgdk, map_start);
+       pmdk = pmd_offset(pudk, map_start);
+
+       mdesc->init_meminfo();
+
+       /* Run the patch stub to update the constants */
+       fixup_pv_table(&__pv_table_begin,
+               (&__pv_table_end - &__pv_table_begin) << 2);
+
+       /*
+        * Cache cleaning operations for self-modifying code
+        * We should clean the entries by MVA but running a
+        * for loop over every pv_table entry pointer would
+        * just complicate the code.
+        */
+       flush_cache_louis();
+       dsb();
+       isb();
+
+       /* remap level 1 table */
+       for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
+               set_pud(pud0,
+                       __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
+               pmd0 += PTRS_PER_PMD;
+       }
+
+       /* remap pmds for kernel mapping */
+       phys = __pa(map_start) & PMD_MASK;
+       do {
+               *pmdk++ = __pmd(phys | pmdprot);
+               phys += PMD_SIZE;
+       } while (phys < map_end);
+
+       flush_cache_all();
+       cpu_switch_mm(pgd0, &init_mm);
+       cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+       local_flush_bp_all();
+       local_flush_tlb_all();
+}
+
+#else
+
+void __init early_paging_init(const struct machine_desc *mdesc,
+                             struct proc_info_list *procinfo)
+{
+       if (mdesc->init_meminfo)
+               mdesc->init_meminfo();
+}
+
+#endif
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
index 34d4ab217babb39b563e493cb4d4643963d33a64..5c668b7a31f97e6df35dcec53b79d5a70a9d1d0b 100644 (file)
@@ -295,6 +295,15 @@ void __init sanity_check_meminfo(void)
        high_memory = __va(end - 1) + 1;
 }
 
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+                             struct proc_info_list *procinfo)
+{
+}
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.
index 1128064fddcbd6c11754d5c2b4241e8b51c7d6a5..45dc29f85d56ca79ae7fa6076f3cee103465e15b 100644 (file)
@@ -220,9 +220,7 @@ __v6_setup:
 #endif /* CONFIG_MMU */
        adr     r5, v6_crval
        ldmia   r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       orr     r6, r6, #1 << 25                @ big-endian page tables
-#endif
+ ARM_BE8(orr   r6, r6, #1 << 25)               @ big-endian page tables
        mrc     p15, 0, r0, c1, c0, 0           @ read control register
        bic     r0, r0, r5                      @ clear the bits
        orr     r0, r0, r6                      @ set them
index c63d9bdee51e65cab7bbad13632207702db41273..60920f62fdf5994f04477bc94aba09dcd349385a 100644 (file)
@@ -367,9 +367,7 @@ __v7_setup:
 #endif
        adr     r5, v7_crval
        ldmia   r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-       orr     r6, r6, #1 << 25                @ big-endian page tables
-#endif
+ ARM_BE8(orr   r6, r6, #1 << 25)               @ big-endian page tables
 #ifdef CONFIG_SWP_EMULATE
        orr     r5, r5, #(1 << 10)              @ set SW bit in "clear"
        bic     r6, r6, #(1 << 10)              @ clear it in "mmuset"
index 99b44e0e8d866983dd60d4b8324bf29010679e77..9ed155ad0f97c42c80fd903919a51dac8b0298d4 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
 #include <asm/hwcap.h>
+#include <asm/opcodes.h>
 
 #include "bpf_jit_32.h"
 
@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor)
 
 static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
 {
+       inst |= (cond << 28);
+       inst = __opcode_to_mem_arm(inst);
+
        if (ctx->target != NULL)
-               ctx->target[ctx->idx] = inst | (cond << 28);
+               ctx->target[ctx->idx] = inst;
 
        ctx->idx++;
 }
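
The JIT change folds the condition field into the instruction first and then runs it through __opcode_to_mem_arm() before storing. A rough user-space model of what that conversion amounts to — the swab32() helper and the sample opcode below are illustrative, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* On a BE8 kernel (big-endian data, little-endian instructions) an opcode
   must be byte-swapped before it is written out with an ordinary data
   store; on a little-endian kernel the conversion is an identity. */
static uint32_t swab32(uint32_t x)
{
        return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
               ((x & 0x00ff0000u) >> 8)  | ((x & 0xff000000u) >> 24);
}

int main(void)
{
        uint32_t inst = 0xe1a00000;     /* mov r0, r0; cond AL already in bits 31:28 */

        printf("LE kernel stores:  0x%08x\n", (unsigned)inst);
        printf("BE8 kernel stores: 0x%08x\n", (unsigned)swab32(inst));
        return 0;
}
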
index 2677bc3762d7ce418b4c116b709dadf0f8e53697..40f27e52de759aed3ab7d3f7e912bf4feab3a358 100644 (file)
@@ -10,6 +10,7 @@
  */
 #include <linux/linkage.h>
 #include <linux/init.h>
+#include <asm/assembler.h>
 
 /*
  * Realview/Versatile Express specific entry point for secondary CPUs.
@@ -17,6 +18,7 @@
  * until we're ready for them to initialise.
  */
 ENTRY(versatile_secondary_startup)
+ ARM_BE8(setend        be)
        mrc     p15, 0, r0, c0, c0, 5
        bic     r0, #0xff000000
        adr     r4, 1f
index 52b8f40b1c73d48d206d497a6fadc381b5ddd888..2f37e1d6cb4500f57386800c8d282df85a6431b2 100644 (file)
@@ -642,9 +642,9 @@ int vfp_restore_user_hwstate(struct user_vfp __user *ufp,
 static int vfp_hotplug(struct notifier_block *b, unsigned long action,
        void *hcpu)
 {
-       if (action == CPU_DYING || action == CPU_DYING_FROZEN) {
-               vfp_force_reload((long)hcpu, current_thread_info());
-       else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
+       if (action == CPU_DYING || action == CPU_DYING_FROZEN)
+               vfp_current_hw_state[(long)hcpu] = NULL;
+       else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
                vfp_enable(NULL);
        return NOTIFY_OK;
 }
index 8363644685711895cbb049c735b6ebb15cf81ff1..01de5aaa3edc112a42548a1b3906e4353bcce949 100644 (file)
@@ -126,20 +126,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
        return oldval;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-       unsigned long tmp, tmp2;
-
-       asm volatile("// atomic_clear_mask\n"
-"1:    ldxr    %0, %2\n"
-"      bic     %0, %0, %3\n"
-"      stxr    %w1, %0, %2\n"
-"      cbnz    %w1, 1b"
-       : "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr)
-       : "Ir" (mask)
-       : "cc");
-}
-
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
index cbfacf7fb4387160862c906c38ad72dea395305c..6a0a9b132d7af11714348f3950990a9854d56ba6 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/uaccess.h>
 
 #include <asm/debug-monitors.h>
-#include <asm/local.h>
 #include <asm/cputype.h>
 #include <asm/system_misc.h>
 
@@ -89,8 +88,8 @@ early_param("nodebugmon", early_debug_disable);
  * Keep track of debug users on each core.
  * The ref counts are per-cpu so we use a local_t type.
  */
-static DEFINE_PER_CPU(local_t, mde_ref_count);
-static DEFINE_PER_CPU(local_t, kde_ref_count);
+static DEFINE_PER_CPU(int, mde_ref_count);
+static DEFINE_PER_CPU(int, kde_ref_count);
 
 void enable_debug_monitors(enum debug_el el)
 {
@@ -98,11 +97,11 @@ void enable_debug_monitors(enum debug_el el)
 
        WARN_ON(preemptible());
 
-       if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1)
+       if (this_cpu_inc_return(mde_ref_count) == 1)
                enable = DBG_MDSCR_MDE;
 
        if (el == DBG_ACTIVE_EL1 &&
-           local_inc_return(&__get_cpu_var(kde_ref_count)) == 1)
+           this_cpu_inc_return(kde_ref_count) == 1)
                enable |= DBG_MDSCR_KDE;
 
        if (enable && debug_enabled) {
@@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_el el)
 
        WARN_ON(preemptible());
 
-       if (local_dec_and_test(&__get_cpu_var(mde_ref_count)))
+       if (this_cpu_dec_return(mde_ref_count) == 0)
                disable = ~DBG_MDSCR_MDE;
 
        if (el == DBG_ACTIVE_EL1 &&
-           local_dec_and_test(&__get_cpu_var(kde_ref_count)))
+           this_cpu_dec_return(kde_ref_count) == 0)
                disable &= ~DBG_MDSCR_KDE;
 
        if (disable) {
index 329218ca9ffbb74456155a44f827a7231e70da8a..ff516f6691e48b77067cfe789d04e1680e79e15f 100644 (file)
@@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
                /* Breakpoint */
                ctrl_reg = AARCH64_DBG_REG_BCR;
                val_reg = AARCH64_DBG_REG_BVR;
-               slots = __get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
                reg_enable = !debug_info->bps_disabled;
        } else {
                /* Watchpoint */
                ctrl_reg = AARCH64_DBG_REG_WCR;
                val_reg = AARCH64_DBG_REG_WVR;
-               slots = __get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
                reg_enable = !debug_info->wps_disabled;
        }
@@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
        if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
                /* Breakpoint */
                base = AARCH64_DBG_REG_BCR;
-               slots = __get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
        } else {
                /* Watchpoint */
                base = AARCH64_DBG_REG_WCR;
-               slots = __get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
        }
 
@@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg, enum debug_el el, int enable)
 
        switch (reg) {
        case AARCH64_DBG_REG_BCR:
-               slots = __get_cpu_var(bp_on_reg);
+               slots = this_cpu_ptr(bp_on_reg);
                max_slots = core_num_brps;
                break;
        case AARCH64_DBG_REG_WCR:
-               slots = __get_cpu_var(wp_on_reg);
+               slots = this_cpu_ptr(wp_on_reg);
                max_slots = core_num_wrps;
                break;
        default:
@@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
        struct debug_info *debug_info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+       slots = this_cpu_ptr(bp_on_reg);
        addr = instruction_pointer(regs);
        debug_info = &current->thread.debug;
 
@@ -596,7 +596,7 @@ unlock:
                        user_enable_single_step(current);
        } else {
                toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0);
-               kernel_step = &__get_cpu_var(stepping_kernel_bp);
+               kernel_step = this_cpu_ptr(&stepping_kernel_bp);
 
                if (*kernel_step != ARM_KERNEL_STEP_NONE)
                        return 0;
@@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
        struct arch_hw_breakpoint *info;
        struct arch_hw_breakpoint_ctrl ctrl;
 
-       slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+       slots = this_cpu_ptr(wp_on_reg);
        debug_info = &current->thread.debug;
 
        for (i = 0; i < core_num_wrps; ++i) {
@@ -698,7 +698,7 @@ unlock:
                        user_enable_single_step(current);
        } else {
                toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0);
-               kernel_step = &__get_cpu_var(stepping_kernel_bp);
+               kernel_step = this_cpu_ptr(&stepping_kernel_bp);
 
                if (*kernel_step != ARM_KERNEL_STEP_NONE)
                        return 0;
@@ -722,7 +722,7 @@ int reinstall_suspended_bps(struct pt_regs *regs)
        struct debug_info *debug_info = &current->thread.debug;
        int handled_exception = 0, *kernel_step;
 
-       kernel_step = &__get_cpu_var(stepping_kernel_bp);
+       kernel_step = this_cpu_ptr(&stepping_kernel_bp);
 
        /*
         * Called from single-step exception handler.
index 5d14470452ac5959712ef9927b3a093bf8975bf3..0e63c98d224c5756c58ba0a4145c40273bd87e86 100644 (file)
@@ -1044,7 +1044,7 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev)
         */
        regs = get_irq_regs();
 
-       cpuc = &__get_cpu_var(cpu_hw_events);
+       cpuc = this_cpu_ptr(&cpu_hw_events);
        for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
                struct perf_event *event = cpuc->events[idx];
                struct hw_perf_event *hwc;
@@ -1258,7 +1258,7 @@ device_initcall(register_pmu_driver);
 
 static struct pmu_hw_events *armpmu_get_cpu_events(void)
 {
-       return &__get_cpu_var(cpu_hw_events);
+       return this_cpu_ptr(&cpu_hw_events);
 }
 
 static void __init cpu_pmu_init(struct arm_pmu *armpmu)
index 69ce573f1224560b4f5c7532e21053b27c651da4..71f337aefa3905feaca892b7ac3b49b4bcb411e3 100644 (file)
@@ -776,6 +776,22 @@ config CRYPTO_AES_ARM
 
          See <http://csrc.nist.gov/encryption/aes/> for more information.
 
+config CRYPTO_AES_ARM_BS
+       tristate "Bit sliced AES using NEON instructions"
+       depends on ARM && KERNEL_MODE_NEON
+       select CRYPTO_ALGAPI
+       select CRYPTO_AES_ARM
+       select CRYPTO_ABLK_HELPER
+       help
+         Use a faster and more secure NEON-based implementation of AES in
+         CBC, CTR and XTS modes.
+
+         Bit-sliced AES gives around a 45% speedup on Cortex-A15 for CTR
+         mode and for XTS mode encryption; for CBC and XTS mode decryption
+         the speedup is around 25%. (CBC encryption speed is not affected
+         by this driver.) This implementation does not rely on any lookup
+         tables, so it is believed to be invulnerable to cache timing
+         attacks.
+
 config CRYPTO_ANUBIS
        tristate "Anubis cipher algorithm"
        select CRYPTO_ALGAPI
index bb5b90e8e7687a0b71d33aae92f7050f741a6fa9..b6739cb78e320755e4fb9480c875b6cc2890dd4e 100644 (file)
@@ -852,7 +852,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
 
        /* Enable the CCI port */
 "      ldr     r0, [r0, %[offsetof_port_phys]] \n"
-"      mov     r3, #"__stringify(CCI_ENABLE_REQ)" \n"
+"      mov     r3, %[cci_enable_req]\n"                   
 "      str     r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n"
 
        /* poll the status reg for completion */
@@ -860,7 +860,7 @@ asmlinkage void __naked cci_enable_port_for_self(void)
 "      ldr     r0, [r1] \n"
 "      ldr     r0, [r0, r1]            @ cci_ctrl_base \n"
 "4:    ldr     r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n"
-"      tst     r1, #1 \n"
+"      tst     r1, %[cci_control_status_bits] \n"                      
 "      bne     4b \n"
 
 "      mov     r0, #0 \n"
@@ -873,6 +873,8 @@ asmlinkage void __naked cci_enable_port_for_self(void)
 "7:    .word   cci_ctrl_phys - . \n"
        : :
        [sizeof_cpu_port] "i" (sizeof(cpu_port)),
+       [cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ),
+       [cci_control_status_bits] "i" cpu_to_le32(1),
 #ifndef __ARMEB__
        [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)),
 #else
index 8ea3b33d4b40bb4b96ad7edd83f6dac7e219aa03..a90be34e4d5c24569dc64a320e59d0a9474e0908 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/module.h>
-
+#include <linux/io.h>
 #include <mach/hardware.h>
 #include <mach/irqs.h>
 
index d0e948084eaf7e2211c323f5976ecc08e5681136..9031171c141b52c5e9175fdbf6eec9bd0c4224b3 100644 (file)
@@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
        if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
                return -EINVAL;
 
+       raw_spin_lock(&irq_controller_lock);
        mask = 0xff << shift;
        bit = gic_cpu_map[cpu] << shift;
-
-       raw_spin_lock(&irq_controller_lock);
        val = readl_relaxed(reg) & ~mask;
        writel_relaxed(val | bit, reg);
        raw_spin_unlock(&irq_controller_lock);
@@ -652,7 +651,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic)
 void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 {
        int cpu;
-       unsigned long map = 0;
+       unsigned long flags, map = 0;
+
+       raw_spin_lock_irqsave(&irq_controller_lock, flags);
 
        /* Convert our logical CPU mask into a physical one. */
        for_each_cpu(cpu, mask)
@@ -666,7 +667,149 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 
        /* this always happens on GIC0 */
        writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+
+       raw_spin_unlock_irqrestore(&irq_controller_lock, flags);
+}
+#endif
+
+#ifdef CONFIG_BL_SWITCHER
+/*
+ * gic_send_sgi - send an SGI directly to the given CPU interface number
+ *
+ * cpu_id: the ID for the destination CPU interface
+ * irq: the IPI number to send an SGI for
+ */
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq)
+{
+       BUG_ON(cpu_id >= NR_GIC_CPU_IF);
+       cpu_id = 1 << cpu_id;
+       /* this always happens on GIC0 */
+       writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+}
+
+/*
+ * gic_get_cpu_id - get the CPU interface ID for the specified CPU
+ *
+ * @cpu: the logical CPU number to get the GIC ID for.
+ *
+ * Return the CPU interface ID for the given logical CPU number,
+ * or -1 if the CPU number is too large or the interface ID is
+ * unknown (more than one bit set).
+ */
+int gic_get_cpu_id(unsigned int cpu)
+{
+       unsigned int cpu_bit;
+
+       if (cpu >= NR_GIC_CPU_IF)
+               return -1;
+       cpu_bit = gic_cpu_map[cpu];
+       if (cpu_bit & (cpu_bit - 1))
+               return -1;
+       return __ffs(cpu_bit);
 }
+
+/*
+ * gic_migrate_target - migrate IRQs to another CPU interface
+ *
+ * @new_cpu_id: the CPU target ID to migrate IRQs to
+ *
+ * Migrate all peripheral interrupts with a target matching the current CPU
+ * to the interface corresponding to @new_cpu_id.  The CPU interface mapping
+ * is also updated.  Targets to other CPU interfaces are unchanged.
+ * This must be called with IRQs locally disabled.
+ */
+void gic_migrate_target(unsigned int new_cpu_id)
+{
+       unsigned int cur_cpu_id, gic_irqs, gic_nr = 0;
+       void __iomem *dist_base;
+       int i, ror_val, cpu = smp_processor_id();
+       u32 val, cur_target_mask, active_mask;
+
+       if (gic_nr >= MAX_GIC_NR)
+               BUG();
+
+       dist_base = gic_data_dist_base(&gic_data[gic_nr]);
+       if (!dist_base)
+               return;
+       gic_irqs = gic_data[gic_nr].gic_irqs;
+
+       cur_cpu_id = __ffs(gic_cpu_map[cpu]);
+       cur_target_mask = 0x01010101 << cur_cpu_id;
+       ror_val = (cur_cpu_id - new_cpu_id) & 31;
+
+       raw_spin_lock(&irq_controller_lock);
+
+       /* Update the target interface for this logical CPU */
+       gic_cpu_map[cpu] = 1 << new_cpu_id;
+
+       /*
+        * Find all the peripheral interrupts targeting the current
+        * CPU interface and migrate them to the new CPU interface.
+        * We skip DIST_TARGET 0 to 7 as they are read-only.
+        */
+       for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) {
+               val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4);
+               active_mask = val & cur_target_mask;
+               if (active_mask) {
+                       val &= ~active_mask;
+                       val |= ror32(active_mask, ror_val);
+                       writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4);
+               }
+       }
+
+       raw_spin_unlock(&irq_controller_lock);
+
+       /*
+        * Now let's migrate and clear any potential SGIs that might be
+        * pending for us (cur_cpu_id).  Since GIC_DIST_SGI_PENDING_SET
+        * is a banked register, we can only forward the SGI using
+        * GIC_DIST_SOFTINT.  The original SGI source is lost but Linux
+        * doesn't use that information anyway.
+        *
+        * For the same reason we do not adjust SGI source information
+        * for SGIs we previously sent to other CPUs either.
+        */
+       for (i = 0; i < 16; i += 4) {
+               int j;
+               val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i);
+               if (!val)
+                       continue;
+               writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i);
+               for (j = i; j < i + 4; j++) {
+                       if (val & 0xff)
+                               writel_relaxed((1 << (new_cpu_id + 16)) | j,
+                                               dist_base + GIC_DIST_SOFTINT);
+                       val >>= 8;
+               }
+       }
+}
+
+/*
+ * gic_get_sgir_physaddr - get the physical address for the SGI register
+ *
+ * Return the physical address of the SGI register to be used
+ * by some early assembly code when the kernel is not yet available.
+ */
+static unsigned long gic_dist_physaddr;
+
+unsigned long gic_get_sgir_physaddr(void)
+{
+       if (!gic_dist_physaddr)
+               return 0;
+       return gic_dist_physaddr + GIC_DIST_SOFTINT;
+}
+
+void __init gic_init_physaddr(struct device_node *node)
+{
+       struct resource res;
+       if (of_address_to_resource(node, 0, &res) == 0) {
+               gic_dist_physaddr = res.start;
+               pr_info("GIC physical location is %#lx\n", gic_dist_physaddr);
+       }
+}
+
+#else
+#define gic_init_physaddr(node)  do { } while (0)
 #endif
 
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
@@ -850,6 +993,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent)
                percpu_offset = 0;
 
        gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
+       if (!gic_cnt)
+               gic_init_physaddr(node);
 
        if (parent) {
                irq = irq_of_parse_and_map(node, 0);
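
The per-byte rotation in gic_migrate_target() deserves a worked example. Below is a user-space sketch with a local ror32() stand-in and a made-up GIC_DIST_TARGET word; it mirrors the per-word step of the loop above, retargeting the interrupts that currently point at CPU interface 0 to interface 2:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the kernel's ror32() (shift is non-zero here). */
static uint32_t ror32(uint32_t word, unsigned int shift)
{
        return (word >> shift) | (word << (32 - shift));
}

int main(void)
{
        /* Made-up GIC_DIST_TARGET word: one target byte per interrupt.
           Interrupts 0 and 3 target interface 0, interrupt 1 targets
           interface 1, interrupt 2 targets interface 2. */
        uint32_t val = 0x01040201;
        unsigned int cur_cpu_id = 0, new_cpu_id = 2;

        uint32_t cur_target_mask = 0x01010101u << cur_cpu_id;
        unsigned int ror_val = (cur_cpu_id - new_cpu_id) & 31;
        uint32_t active_mask = val & cur_target_mask;

        val &= ~active_mask;
        val |= ror32(active_mask, ror_val);

        /* Interrupts 0 and 3 now target interface 2: prints 0x04040204. */
        printf("retargeted word: 0x%08x\n", (unsigned)val);
        return 0;
}
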
index c3785edc0e92c851d3c36a0481a1b2936f92fc63..d135c76c4855b825175370e215979bd528e4b2d4 100644 (file)
@@ -62,6 +62,7 @@ static unsigned int fmax = 515633;
  * @signal_direction: input/out direction of bus signals can be indicated
  * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
  * @busy_detect: true if busy detection on dat0 is supported
+ * @pwrreg_nopower: bits in MMCIPOWER don't control ext. power supply
  */
 struct variant_data {
        unsigned int            clkreg;
@@ -76,6 +77,7 @@ struct variant_data {
        bool                    signal_direction;
        bool                    pwrreg_clkgate;
        bool                    busy_detect;
+       bool                    pwrreg_nopower;
 };
 
 static struct variant_data variant_arm = {
@@ -109,6 +111,7 @@ static struct variant_data variant_u300 = {
        .pwrreg_powerup         = MCI_PWR_ON,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
+       .pwrreg_nopower         = true,
 };
 
 static struct variant_data variant_nomadik = {
@@ -121,6 +124,7 @@ static struct variant_data variant_nomadik = {
        .pwrreg_powerup         = MCI_PWR_ON,
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
+       .pwrreg_nopower         = true,
 };
 
 static struct variant_data variant_ux500 = {
@@ -135,6 +139,7 @@ static struct variant_data variant_ux500 = {
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
+       .pwrreg_nopower         = true,
 };
 
 static struct variant_data variant_ux500v2 = {
@@ -150,6 +155,7 @@ static struct variant_data variant_ux500v2 = {
        .signal_direction       = true,
        .pwrreg_clkgate         = true,
        .busy_detect            = true,
+       .pwrreg_nopower         = true,
 };
 
 static int mmci_card_busy(struct mmc_host *mmc)
@@ -189,6 +195,21 @@ static int mmci_validate_data(struct mmci_host *host,
        return 0;
 }
 
+static void mmci_reg_delay(struct mmci_host *host)
+{
+       /*
+        * According to the spec, at least three feedback clock cycles
+        * of max 52 MHz must pass between two writes to the MMCICLOCK reg.
+        * Three MCLK clock cycles must pass between two MMCIPOWER reg writes.
+        * Worst delay time during card init is at 100 kHz => 30 us.
+        * Worst delay time when up and running is at 25 MHz => 120 ns.
+        */
+       if (host->cclk < 25000000)
+               udelay(30);
+       else
+               ndelay(120);
+}
+
 /*
  * This must be called with host->lock held
  */
@@ -1264,6 +1285,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 
        mmci_set_clkreg(host, ios->clock);
        mmci_write_pwrreg(host, pwr);
+       mmci_reg_delay(host);
 
        spin_unlock_irqrestore(&host->lock, flags);
 
@@ -1510,23 +1532,6 @@ static int mmci_probe(struct amba_device *dev,
                mmc->f_max = min(host->mclk, fmax);
        dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
 
-       host->pinctrl = devm_pinctrl_get(&dev->dev);
-       if (IS_ERR(host->pinctrl)) {
-               ret = PTR_ERR(host->pinctrl);
-               goto clk_disable;
-       }
-
-       host->pins_default = pinctrl_lookup_state(host->pinctrl,
-                       PINCTRL_STATE_DEFAULT);
-
-       /* enable pins to be muxed in and configured */
-       if (!IS_ERR(host->pins_default)) {
-               ret = pinctrl_select_state(host->pinctrl, host->pins_default);
-               if (ret)
-                       dev_warn(&dev->dev, "could not set default pins\n");
-       } else
-               dev_warn(&dev->dev, "could not get default pinstate\n");
-
        /* Get regulators and the supported OCR mask */
        mmc_regulator_get_supply(mmc);
        if (!mmc->ocr_avail)
@@ -1760,6 +1765,41 @@ static int mmci_resume(struct device *dev)
 #endif
 
 #ifdef CONFIG_PM_RUNTIME
+static void mmci_save(struct mmci_host *host)
+{
+       unsigned long flags;
+
+       if (host->variant->pwrreg_nopower) {
+               spin_lock_irqsave(&host->lock, flags);
+
+               writel(0, host->base + MMCIMASK0);
+               writel(0, host->base + MMCIDATACTRL);
+               writel(0, host->base + MMCIPOWER);
+               writel(0, host->base + MMCICLOCK);
+               mmci_reg_delay(host);
+
+               spin_unlock_irqrestore(&host->lock, flags);
+       }
+
+}
+
+static void mmci_restore(struct mmci_host *host)
+{
+       unsigned long flags;
+
+       if (host->variant->pwrreg_nopower) {
+               spin_lock_irqsave(&host->lock, flags);
+
+               writel(host->clk_reg, host->base + MMCICLOCK);
+               writel(host->datactrl_reg, host->base + MMCIDATACTRL);
+               writel(host->pwr_reg, host->base + MMCIPOWER);
+               writel(MCI_IRQENABLE, host->base + MMCIMASK0);
+               mmci_reg_delay(host);
+
+               spin_unlock_irqrestore(&host->lock, flags);
+       }
+}
+
 static int mmci_runtime_suspend(struct device *dev)
 {
        struct amba_device *adev = to_amba_device(dev);
@@ -1767,6 +1807,8 @@ static int mmci_runtime_suspend(struct device *dev)
 
        if (mmc) {
                struct mmci_host *host = mmc_priv(mmc);
+               pinctrl_pm_select_sleep_state(dev);
+               mmci_save(host);
                clk_disable_unprepare(host->clk);
        }
 
@@ -1781,6 +1823,8 @@ static int mmci_runtime_resume(struct device *dev)
        if (mmc) {
                struct mmci_host *host = mmc_priv(mmc);
                clk_prepare_enable(host->clk);
+               mmci_restore(host);
+               pinctrl_pm_select_default_state(dev);
        }
 
        return 0;
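
The 30 us and 120 ns figures quoted in the mmci_reg_delay() comment both fall out of the same "three clock cycles at frequency f" rule. A quick stand-alone check — the function name below is illustrative only:

#include <stdio.h>

/* Three cycles of a clock running at cclk_hz, expressed in nanoseconds. */
static unsigned long three_cycles_ns(unsigned long cclk_hz)
{
        return 3000000000UL / cclk_hz;
}

int main(void)
{
        printf("card init at 100 kHz: %lu ns (30 us)\n", three_cycles_ns(100000));
        printf("running at 25 MHz:    %lu ns\n", three_cycles_ns(25000000));
        return 0;
}
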
index 69080fab637520af2a9ea8b4a6ba49ae52b0344f..168bc72f7a94a9b662d7c0a97775c781d4d769aa 100644 (file)
@@ -200,10 +200,6 @@ struct mmci_host {
        struct sg_mapping_iter  sg_miter;
        unsigned int            size;
 
-       /* pinctrl handles */
-       struct pinctrl          *pinctrl;
-       struct pinctrl_state    *pins_default;
-
 #ifdef CONFIG_DMA_ENGINE
        /* DMA stuff */
        struct dma_chan         *dma_current;
index 682df0e1954a96718ae1e439db73441681101145..63b5eff0a80f647ffc0c6bdb6064ac56d482fc34 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/resource.h>
 #include <linux/regulator/consumer.h>
 
-#define AMBA_NR_IRQS   2
+#define AMBA_NR_IRQS   9
 #define AMBA_CID       0xb105f00d
 
 struct clk;
index 0e5d9ecdb2b672d901b47f184a4b720e604317e2..cac496b1e279293164066ea0204c87309169bd49 100644 (file)
@@ -31,6 +31,8 @@
 #define GIC_DIST_TARGET                        0x800
 #define GIC_DIST_CONFIG                        0xc00
 #define GIC_DIST_SOFTINT               0xf00
+#define GIC_DIST_SGI_PENDING_CLEAR     0xf10
+#define GIC_DIST_SGI_PENDING_SET       0xf20
 
 #define GICH_HCR                       0x0
 #define GICH_VTR                       0x4
@@ -74,6 +76,11 @@ static inline void gic_init(unsigned int nr, int start,
        gic_init_bases(nr, start, dist, cpu, 0, NULL);
 }
 
+void gic_send_sgi(unsigned int cpu_id, unsigned int irq);
+int gic_get_cpu_id(unsigned int cpu);
+void gic_migrate_target(unsigned int new_cpu_id);
+unsigned long gic_get_sgir_physaddr(void);
+
 #endif /* __ASSEMBLY */
 
 #endif
diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h
new file mode 100644 (file)
index 0000000..f76dd4d
--- /dev/null
@@ -0,0 +1,67 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM power
+
+#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWER_CPU_MIGRATE_H
+
+#include <linux/tracepoint.h>
+
+#define __cpu_migrate_proto                    \
+       TP_PROTO(u64 timestamp,                 \
+                u32 cpu_hwid)
+#define __cpu_migrate_args                     \
+       TP_ARGS(timestamp,                      \
+               cpu_hwid)
+
+DECLARE_EVENT_CLASS(cpu_migrate,
+
+       __cpu_migrate_proto,
+       __cpu_migrate_args,
+
+       TP_STRUCT__entry(
+               __field(u64,    timestamp               )
+               __field(u32,    cpu_hwid                )
+       ),
+
+       TP_fast_assign(
+               __entry->timestamp = timestamp;
+               __entry->cpu_hwid = cpu_hwid;
+       ),
+
+       TP_printk("timestamp=%llu cpu_hwid=0x%08lX",
+               (unsigned long long)__entry->timestamp,
+               (unsigned long)__entry->cpu_hwid
+       )
+);
+
+#define __define_cpu_migrate_event(name)               \
+       DEFINE_EVENT(cpu_migrate, cpu_migrate_##name,   \
+               __cpu_migrate_proto,                    \
+               __cpu_migrate_args                      \
+       )
+
+__define_cpu_migrate_event(begin);
+__define_cpu_migrate_event(finish);
+__define_cpu_migrate_event(current);
+
+#undef __define_cpu_migrate
+#undef __cpu_migrate_proto
+#undef __cpu_migrate_args
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING
+
+/*
+ * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate
+ * a whole-cluster migration:
+ */
+#define CPU_MIGRATE_ALL_CPUS 0x80000000U
+#endif
+
+#endif /* _TRACE_POWER_CPU_MIGRATE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE power_cpu_migrate
+#include <trace/define_trace.h>
index 15130b50dfe3be2fea6a975a102283f6d33e4887..fe9b61e322a557b063ecc78eccb9a87c8de73dda 100644 (file)
@@ -2,3 +2,6 @@ ifndef NO_DWARF
 PERF_HAVE_DWARF_REGS := 1
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o
+endif
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
new file mode 100644 (file)
index 0000000..2a1cfde
--- /dev/null
@@ -0,0 +1,54 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include "../../util/types.h"
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1)
+#define PERF_REG_IP    PERF_REG_ARM_PC
+#define PERF_REG_SP    PERF_REG_ARM_SP
+
+static inline const char *perf_reg_name(int id)
+{
+       switch (id) {
+       case PERF_REG_ARM_R0:
+               return "r0";
+       case PERF_REG_ARM_R1:
+               return "r1";
+       case PERF_REG_ARM_R2:
+               return "r2";
+       case PERF_REG_ARM_R3:
+               return "r3";
+       case PERF_REG_ARM_R4:
+               return "r4";
+       case PERF_REG_ARM_R5:
+               return "r5";
+       case PERF_REG_ARM_R6:
+               return "r6";
+       case PERF_REG_ARM_R7:
+               return "r7";
+       case PERF_REG_ARM_R8:
+               return "r8";
+       case PERF_REG_ARM_R9:
+               return "r9";
+       case PERF_REG_ARM_R10:
+               return "r10";
+       case PERF_REG_ARM_FP:
+               return "fp";
+       case PERF_REG_ARM_IP:
+               return "ip";
+       case PERF_REG_ARM_SP:
+               return "sp";
+       case PERF_REG_ARM_LR:
+               return "lr";
+       case PERF_REG_ARM_PC:
+               return "pc";
+       default:
+               return NULL;
+       }
+
+       return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/arm/util/unwind.c b/tools/perf/arch/arm/util/unwind.c
new file mode 100644 (file)
index 0000000..da3dc95
--- /dev/null
@@ -0,0 +1,48 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+int unwind__arch_reg_id(int regnum)
+{
+       switch (regnum) {
+       case UNW_ARM_R0:
+               return PERF_REG_ARM_R0;
+       case UNW_ARM_R1:
+               return PERF_REG_ARM_R1;
+       case UNW_ARM_R2:
+               return PERF_REG_ARM_R2;
+       case UNW_ARM_R3:
+               return PERF_REG_ARM_R3;
+       case UNW_ARM_R4:
+               return PERF_REG_ARM_R4;
+       case UNW_ARM_R5:
+               return PERF_REG_ARM_R5;
+       case UNW_ARM_R6:
+               return PERF_REG_ARM_R6;
+       case UNW_ARM_R7:
+               return PERF_REG_ARM_R7;
+       case UNW_ARM_R8:
+               return PERF_REG_ARM_R8;
+       case UNW_ARM_R9:
+               return PERF_REG_ARM_R9;
+       case UNW_ARM_R10:
+               return PERF_REG_ARM_R10;
+       case UNW_ARM_R11:
+               return PERF_REG_ARM_FP;
+       case UNW_ARM_R12:
+               return PERF_REG_ARM_IP;
+       case UNW_ARM_R13:
+               return PERF_REG_ARM_SP;
+       case UNW_ARM_R14:
+               return PERF_REG_ARM_LR;
+       case UNW_ARM_R15:
+               return PERF_REG_ARM_PC;
+       default:
+               pr_err("unwind: invalid reg id %d\n", regnum);
+               return -EINVAL;
+       }
+
+       return -EINVAL;
+}
index 58b2d37ae23a19db3719ebfa29c98a79905a4c49..f5905f2b197d81d7cb2c8e0ee5a3f2cc257d5f20 100644 (file)
@@ -31,6 +31,10 @@ ifeq ($(ARCH),x86_64)
   endif
   NO_PERF_REGS := 0
 endif
+ifeq ($(ARCH),arm)
+  NO_PERF_REGS := 0
+  LIBUNWIND_LIBS = -lunwind -lunwind-arm
+endif
 
 ifeq ($(NO_PERF_REGS),0)
   CFLAGS += -DHAVE_PERF_REGS_SUPPORT
@@ -305,8 +309,7 @@ ifndef NO_LIBELF
   endif # NO_DWARF
 endif # NO_LIBELF
 
-# There's only x86 (both 32 and 64) support for CFI unwind so far
-ifneq ($(ARCH),x86)
+ifeq ($(LIBUNWIND_LIBS),)
   NO_LIBUNWIND := 1
 endif
 
@@ -322,8 +325,13 @@ ifndef NO_LIBUNWIND
   endif
 
   ifneq ($(feature-libunwind), 1)
-    msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99);
+    msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
     NO_LIBUNWIND := 1
+  else
+    ifneq ($(feature-libunwind-debug-frame), 1)
+      msg := $(warning No debug_frame support found in libunwind);
+      CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+    endif
   endif
 endif
 
index c803f17fb9869648c97ba9efe93e3a05b4e4329d..e8e195f49a4ee7695a80b2a4d535fb35a5aac504 100644 (file)
@@ -23,6 +23,7 @@ FILES=                                        \
        test-libpython-version          \
        test-libslang                   \
        test-libunwind                  \
+       test-libunwind-debug-frame      \
        test-on-exit                    \
        test-stackprotector-all         \
        test-stackprotector             \
index 59e7a705e146d4eb8ea029ebf74878411f8e3898..799865b607721e247b847711cf656ef375ada341 100644 (file)
 # include "test-libunwind.c"
 #undef main
 
+#define main main_test_libunwind_debug_frame
+# include "test-libunwind-debug-frame.c"
+#undef main
+
 #define main main_test_libaudit
 # include "test-libaudit.c"
 #undef main
diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
new file mode 100644 (file)
index 0000000..0ef8087
--- /dev/null
@@ -0,0 +1,16 @@
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+                                unw_word_t ip, unw_word_t segbase,
+                                const char *obj_name, unw_word_t start,
+                                unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+int main(void)
+{
+       dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
+       return 0;
+}
index 2f891f7e70bf9251c849780ec008ded14a2776c9..5390d0b8862a680e147cf52dd8bed22cfbba333f 100644 (file)
@@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
 
 #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
 
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+                                unw_word_t ip,
+                                unw_word_t segbase,
+                                const char *obj_name, unw_word_t start,
+                                unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
 #define DW_EH_PE_FORMAT_MASK   0x0f    /* format of the encoded value */
 #define DW_EH_PE_APPL_MASK     0x70    /* how the value is to be applied */
 
@@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
        return 0;
 }
 
-static int read_unwind_spec(struct dso *dso, struct machine *machine,
-                           u64 *table_data, u64 *segbase, u64 *fde_count)
+static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
+                                    u64 *table_data, u64 *segbase,
+                                    u64 *fde_count)
 {
        int ret = -EINVAL, fd;
        u64 offset;
@@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
        if (fd < 0)
                return -EINVAL;
 
+       /* Check the .eh_frame section for unwinding info */
        offset = elf_section_offset(fd, ".eh_frame_hdr");
        close(fd);
 
@@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine,
                                          table_data, segbase,
                                          fde_count);
 
-       /* TODO .debug_frame check if eh_frame_hdr fails */
        return ret;
 }
 
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+static int read_unwind_spec_debug_frame(struct dso *dso,
+                                       struct machine *machine, u64 *offset)
+{
+       int fd = dso__data_fd(dso, machine);
+
+       if (fd < 0)
+               return -EINVAL;
+
+       /* Check the .debug_frame section for unwinding info */
+       *offset = elf_section_offset(fd, ".debug_frame");
+       close(fd);
+
+       if (*offset)
+               return 0;
+
+       return -EINVAL;
+}
+#endif
+
 static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
 {
        struct addr_location al;
@@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
 
        pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
 
-       if (read_unwind_spec(map->dso, ui->machine,
-                            &table_data, &segbase, &fde_count))
-               return -EINVAL;
+       /* Check the .eh_frame section for unwinding info */
+       if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
+                                      &table_data, &segbase, &fde_count)) {
+               memset(&di, 0, sizeof(di));
+               di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
+               di.start_ip = map->start;
+               di.end_ip   = map->end;
+               di.u.rti.segbase    = map->start + segbase;
+               di.u.rti.table_data = map->start + table_data;
+               di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
+                                     / sizeof(unw_word_t);
+               return dwarf_search_unwind_table(as, ip, &di, pi,
+                                                need_unwind_info, arg);
+       }
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+       /* Check the .debug_frame section for unwinding info */
+       if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
+               memset(&di, 0, sizeof(di));
+               dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name,
+                                      map->start, map->end);
+               return dwarf_search_unwind_table(as, ip, &di, pi,
+                                                need_unwind_info, arg);
+       }
+#endif
 
-       memset(&di, 0, sizeof(di));
-       di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
-       di.start_ip = map->start;
-       di.end_ip   = map->end;
-       di.u.rti.segbase    = map->start + segbase;
-       di.u.rti.table_data = map->start + table_data;
-       di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
-                             / sizeof(unw_word_t);
-       return dwarf_search_unwind_table(as, ip, &di, pi,
-                                        need_unwind_info, arg);
+       return -EINVAL;
 }
 
 static int access_fpreg(unw_addr_space_t __maybe_unused as,