diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 95102d386b..9c4787f8ed 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -455,8 +455,8 @@ config ARM_CORTEX_CPU_IS_UP
 
 config USE_ARCH_MEMCPY
 	bool "Use an assembly optimized implementation of memcpy"
-	default y
-	depends on !ARM64
+	default y if !ARM64
+	depends on !ARM64 || (ARM64 && (GCC_VERSION >= 90400))
 	help
 	  Enable the generation of an optimized version of memcpy.
 	  Such an implementation may be faster under some conditions
@@ -465,7 +465,7 @@ config USE_ARCH_MEMCPY
 config SPL_USE_ARCH_MEMCPY
 	bool "Use an assembly optimized implementation of memcpy for SPL"
 	default y if USE_ARCH_MEMCPY
-	depends on !ARM64 && SPL
+	depends on SPL
 	help
 	  Enable the generation of an optimized version of memcpy.
 	  Such an implementation may be faster under some conditions
@@ -474,16 +474,43 @@ config SPL_USE_ARCH_MEMCPY
 config TPL_USE_ARCH_MEMCPY
 	bool "Use an assembly optimized implementation of memcpy for TPL"
 	default y if USE_ARCH_MEMCPY
-	depends on !ARM64 && TPL
+	depends on TPL
 	help
 	  Enable the generation of an optimized version of memcpy.
 	  Such an implementation may be faster under some conditions
 	  but may increase the binary size.
 
+config USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove" if !ARM64
+	default USE_ARCH_MEMCPY if ARM64
+	depends on ARM64
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config SPL_USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove for SPL" if !ARM64
+	default SPL_USE_ARCH_MEMCPY if ARM64
+	depends on SPL && ARM64
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
+config TPL_USE_ARCH_MEMMOVE
+	bool "Use an assembly optimized implementation of memmove for TPL" if !ARM64
+	default TPL_USE_ARCH_MEMCPY if ARM64
+	depends on TPL && ARM64
+	help
+	  Enable the generation of an optimized version of memmove.
+	  Such an implementation may be faster under some conditions
+	  but may increase the binary size.
+
 config USE_ARCH_MEMSET
 	bool "Use an assembly optimized implementation of memset"
-	default y
-	depends on !ARM64
+	default y if !ARM64
+	depends on !ARM64 || (ARM64 && (GCC_VERSION >= 90400))
 	help
 	  Enable the generation of an optimized version of memset.
 	  Such an implementation may be faster under some conditions
@@ -492,7 +519,7 @@ config USE_ARCH_MEMSET
 config SPL_USE_ARCH_MEMSET
 	bool "Use an assembly optimized implementation of memset for SPL"
 	default y if USE_ARCH_MEMSET
-	depends on !ARM64 && SPL
+	depends on SPL
 	help
 	  Enable the generation of an optimized version of memset.
 	  Such an implementation may be faster under some conditions
@@ -501,7 +528,7 @@ config SPL_USE_ARCH_MEMSET
 config TPL_USE_ARCH_MEMSET
 	bool "Use an assembly optimized implementation of memset for TPL"
 	default y if USE_ARCH_MEMSET
-	depends on !ARM64 && TPL
+	depends on TPL
 	help
 	  Enable the generation of an optimized version of memset.
 	  Such an implementation may be faster under some conditions
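
The new USE_ARCH_MEMMOVE, SPL_USE_ARCH_MEMMOVE and TPL_USE_ARCH_MEMMOVE symbols exist so that the string.h change below can test CONFIG_IS_ENABLED(USE_ARCH_MEMMOVE), which resolves to the plain, SPL_ or TPL_ variant depending on the build stage. A minimal stand-alone C sketch of that per-stage selection follows; it is illustrative only, the config values are hypothetical and the real macro machinery lives in include/linux/kconfig.h:

/* Illustrative per-stage selection; the two values below are made up. */
#include <stdio.h>

#define CONFIG_USE_ARCH_MEMMOVE     1	/* hypothetical .config result */
#define CONFIG_SPL_USE_ARCH_MEMMOVE 0	/* hypothetical .config result */

#ifdef CONFIG_SPL_BUILD
/* SPL build: the SPL_ symbol decides. */
#define ARCH_MEMMOVE_ENABLED CONFIG_SPL_USE_ARCH_MEMMOVE
#else
/* Main U-Boot build: the plain symbol decides. */
#define ARCH_MEMMOVE_ENABLED CONFIG_USE_ARCH_MEMMOVE
#endif

int main(void)
{
	printf("arch memmove enabled: %d\n", ARCH_MEMMOVE_ENABLED);
	return 0;
}

On ARM64 the memmove options carry no prompt and simply follow the corresponding memcpy option, which matches the assembly below providing memmove as an alias of memcpy.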
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index 11eaa34fab..ead3f2c356 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -19,7 +19,11 @@ extern char * strchr(const char * s, int c);
 #endif
 
 extern void * memcpy(void *, const void *, __kernel_size_t);
+#if CONFIG_IS_ENABLED(USE_ARCH_MEMMOVE)
+#define __HAVE_ARCH_MEMMOVE
+#else
 #undef __HAVE_ARCH_MEMMOVE
+#endif
 extern void * memmove(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMCHR
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 7f66332715..c48e1f622d 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -39,8 +39,13 @@ obj-$(CONFIG_$(SPL_TPL_)FRAMEWORK) += spl.o
 obj-$(CONFIG_SPL_FRAMEWORK) += zimage.o
 obj-$(CONFIG_OF_LIBFDT) += bootm-fdt.o
 endif
+ifdef CONFIG_ARM64
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset-arm64.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy-arm64.o
+else
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
+endif
 obj-$(CONFIG_SEMIHOSTING) += semihosting.o
 
 obj-y	+= bdinfo.o
diff --git a/arch/arm/lib/asmdefs.h b/arch/arm/lib/asmdefs.h
new file mode 100644
index 0000000000..d307a3a8a2
--- /dev/null
+++ b/arch/arm/lib/asmdefs.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identification support.  */
+#define BTI_C		hint	34
+#define BTI_J		hint	36
+/* Return address signing support (pac-ret).  */
+#define PACIASP		hint	25; .cfi_window_save
+#define AUTIASP		hint	29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h.  */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
+#define GNU_PROPERTY(type, value)	\
+  .section .note.gnu.property, "a";	\
+  .p2align 3;				\
+  .word 4;				\
+  .word 16;				\
+  .word 5;				\
+  .asciz "GNU";				\
+  .word type;				\
+  .word 4;				\
+  .word value;				\
+  .word 0;				\
+  .text
+
+/* If set then the GNU Property Note section will be added to
+   mark objects to support BTI and PAC-RET.  */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files.  */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;	\
+  name:			\
+  .cfi_startproc;	\
+  BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment)	\
+  .global name;		\
+  .type name,%function;	\
+  .align alignment;	\
+  name:			\
+  .cfi_startproc;
+
+#endif
+
+#define ENTRY(name)	ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name)	\
+  .global name;		\
+  .type name,%function;	\
+  name:
+
+#define END(name)	\
+  .cfi_endproc;		\
+  .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n)  mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+  /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n)  mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
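
Defining __HAVE_ARCH_MEMMOVE when the option is enabled compiles out the generic C memmove fallback, leaving the ENTRY_ALIAS (memmove) entry point from memcpy-arm64.S below as the only definition. A rough, self-contained sketch of that guard pattern (not a verbatim copy of U-Boot's lib/string.c):

#include <stddef.h>

/* When the architecture provides memmove, its string.h defines
 * __HAVE_ARCH_MEMMOVE and this generic byte-wise fallback drops out. */
#ifndef __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count)
{
	char *d = dest;
	const char *s = src;

	if (d <= s) {
		while (count--)
			*d++ = *s++;
	} else {
		/* Copy backwards so overlapping regions stay intact. */
		d += count;
		s += count;
		while (count--)
			*--d = *--s;
	}
	return dest;
}
#endif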
diff --git a/arch/arm/lib/memcpy-arm64.S b/arch/arm/lib/memcpy-arm64.S
new file mode 100644
index 0000000000..507054d847
--- /dev/null
+++ b/arch/arm/lib/memcpy-arm64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	dst
+#define H_l	src
+#define H_h	srcend
+#define tmp1	x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
+END (memcpy)
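
The overlap check at L(copy_long) works because dst - src is computed modulo 2^64: the difference is below count only when the destination starts inside the not-yet-read part of the source, which is the one case where a forward copy would overwrite data before it has been read. A C model of that decision (illustrative only; copy_model is a made-up name and this code is not part of the patch):

#include <stddef.h>
#include <stdint.h>

/* Mirrors the forward/backward choice made in L(copy_long). */
void copy_model(void *dst, const void *src, size_t count)
{
	uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;	/* wraps, like sub */
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (diff == 0 || count == 0)
		return;				/* cbz tmp1, L(copy0) */

	if (diff < count) {
		/* dst lies inside [src, src + count): copy from the end. */
		d += count;
		s += count;
		while (count--)
			*--d = *--s;
	} else {
		/* No harmful overlap: a plain forward copy is safe. */
		while (count--)
			*d++ = *s++;
	}
}

This is why a single entry point can serve both memcpy and memmove: for non-overlapping buffers the check costs only a subtract and a compare on the large-copy path.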
diff --git a/arch/arm/lib/memset-arm64.S b/arch/arm/lib/memset-arm64.S
new file mode 100644
index 0000000000..ee9f9a96cf
--- /dev/null
+++ b/arch/arm/lib/memset-arm64.S
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include <asm/macro.h>
+#include "asmdefs.h"
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+
+ENTRY (memset)
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	/*
+	 * The optimized memset uses the dc opcode, which causes problems
+	 * when the cache is disabled. Let's check if the cache is disabled
+	 * and use a very simple memset implementation in this case. Otherwise
+	 * jump to the optimized version.
+	 */
+	switch_el x6, 3f, 2f, 1f
+3:	mrs	x6, sctlr_el3
+	b	0f
+2:	mrs	x6, sctlr_el2
+	b	0f
+1:	mrs	x6, sctlr_el1
+0:
+	tst	x6, #CR_C
+	bne	9f
+
+	/*
+	 * A very "simple" memset implementation without the use of the
+	 * dc opcode. Can be run with caches disabled.
+	 */
+	mov	x3, #0x0
+	cmp	count, x3	/* check for zero length */
+	beq	8f
+4:	strb	valw, [dstin, x3]
+	add	x3, x3, #0x1
+	cmp	count, x3
+	bne	4b
+8:	ret
+9:
+
+	/* Here the optimized memset version starts */
+	dup	v0.16B, valw
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	.p2align 4
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 17..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 160
+	ccmp	valw, 0, 0, hs
+	b.ne	L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	L(zva_loop)
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (memset)
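
Two points in memset-arm64.S are worth spelling out. The DC ZVA fast path is taken only when the fill value is zero (the ccmp after "cmp count, 160"), the length is at least 160 bytes and dczid_el0 reports a 64-byte zero block; and the whole optimized body is skipped when the SCTLR_ELx.C bit shows the data cache is off, because dc zva is problematic with the cache disabled, so a plain byte loop runs instead. A C model of the zero-fill structure follows; it is illustrative only, zero_fill_model and ZVA_BLOCK are made-up names, and memset() stands in for the q-register stores and the dc zva instruction:

#include <stdint.h>
#include <string.h>

#define ZVA_BLOCK 64	/* zero block size reported by dczid_el0 in the fast path */

void zero_fill_model(void *dst, size_t count)
{
	uint8_t *start = dst;
	uint8_t *end = start + count;
	uint8_t *aligned = (uint8_t *)(((uintptr_t)start + ZVA_BLOCK - 1) &
				       ~(uintptr_t)(ZVA_BLOCK - 1));

	if (count < 2 * ZVA_BLOCK) {		/* too short for the block loop */
		memset(start, 0, count);
		return;
	}

	memset(start, 0, aligned - start);		/* unaligned head */
	while ((size_t)(end - aligned) >= ZVA_BLOCK) {
		memset(aligned, 0, ZVA_BLOCK);		/* one dc zva per block */
		aligned += ZVA_BLOCK;
	}
	memset(aligned, 0, end - aligned);		/* unaligned tail */
}

Restricting the dc zva path to zero fills is what makes the block loop so cheap: each 64-byte block is cleared by a single cache-maintenance instruction instead of four 16-byte stores.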