riscv: optimized memset

The generic memset is defined as a byte at time write. This is always
safe, but it's slower than a 4 byte or even 8 byte write.

Write a generic memset which fills the data one byte at time until the
destination is aligned, then fills using the largest size allowed,
and finally fills the remaining data one byte at time.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
This commit is contained in:
Matteo Croce 2021-09-29 19:22:34 +02:00 committed by Emil Renner Berthing
parent 0bdabfc1f1
commit 1869f27c9f
7 changed files with 44 additions and 138 deletions

View file

@ -6,13 +6,9 @@
#ifndef _ASM_RISCV_STRING_H
#define _ASM_RISCV_STRING_H
#include <linux/types.h>
#include <linux/linkage.h>
#define __HAVE_ARCH_MEMSET
extern asmlinkage void *memset(void *, int, size_t);
extern asmlinkage void *__memset(void *, int, size_t);
extern void *memset(void *s, int c, size_t count);
extern void *__memset(void *s, int c, size_t count);
#define __HAVE_ARCH_MEMCPY
extern void *memcpy(void *dest, const void *src, size_t count);
extern void *__memcpy(void *dest, const void *src, size_t count);

View file

@ -46,7 +46,6 @@ obj-y += syscall_table.o
obj-y += sys_riscv.o
obj-y += time.o
obj-y += traps.o
obj-y += riscv_ksyms.o
obj-y += stacktrace.o
obj-y += cacheinfo.o
obj-y += patch.o

View file

@ -1,13 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Zihao Yu
*/
#include <linux/export.h>
#include <linux/uaccess.h>
/*
* Assembly functions that may be used (directly or indirectly) by modules
*/
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(__memset);

View file

@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
lib-y += delay.o
lib-y += memset.o
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
lib-y += string.o

View file

@ -1,113 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 Regents of the University of California
*/
#include <linux/linkage.h>
#include <asm/asm.h>
/* void *memset(void *, int, size_t) */
ENTRY(__memset)
WEAK(memset)
move t0, a0 /* Preserve return value */
/* Defer to byte-oriented fill for small sizes */
sltiu a3, a2, 16
bnez a3, 4f
/*
* Round to nearest XLEN-aligned address
* greater than or equal to start address
*/
addi a3, t0, SZREG-1
andi a3, a3, ~(SZREG-1)
beq a3, t0, 2f /* Skip if already aligned */
/* Handle initial misalignment */
sub a4, a3, t0
1:
sb a1, 0(t0)
addi t0, t0, 1
bltu t0, a3, 1b
sub a2, a2, a4 /* Update count */
2: /* Duff's device with 32 XLEN stores per iteration */
/* Broadcast value into all bytes */
andi a1, a1, 0xff
slli a3, a1, 8
or a1, a3, a1
slli a3, a1, 16
or a1, a3, a1
#ifdef CONFIG_64BIT
slli a3, a1, 32
or a1, a3, a1
#endif
/* Calculate end address */
andi a4, a2, ~(SZREG-1)
add a3, t0, a4
andi a4, a4, 31*SZREG /* Calculate remainder */
beqz a4, 3f /* Shortcut if no remainder */
neg a4, a4
addi a4, a4, 32*SZREG /* Calculate initial offset */
/* Adjust start address with offset */
sub t0, t0, a4
/* Jump into loop body */
/* Assumes 32-bit instruction lengths */
la a5, 3f
#ifdef CONFIG_64BIT
srli a4, a4, 1
#endif
add a5, a5, a4
jr a5
3:
REG_S a1, 0(t0)
REG_S a1, SZREG(t0)
REG_S a1, 2*SZREG(t0)
REG_S a1, 3*SZREG(t0)
REG_S a1, 4*SZREG(t0)
REG_S a1, 5*SZREG(t0)
REG_S a1, 6*SZREG(t0)
REG_S a1, 7*SZREG(t0)
REG_S a1, 8*SZREG(t0)
REG_S a1, 9*SZREG(t0)
REG_S a1, 10*SZREG(t0)
REG_S a1, 11*SZREG(t0)
REG_S a1, 12*SZREG(t0)
REG_S a1, 13*SZREG(t0)
REG_S a1, 14*SZREG(t0)
REG_S a1, 15*SZREG(t0)
REG_S a1, 16*SZREG(t0)
REG_S a1, 17*SZREG(t0)
REG_S a1, 18*SZREG(t0)
REG_S a1, 19*SZREG(t0)
REG_S a1, 20*SZREG(t0)
REG_S a1, 21*SZREG(t0)
REG_S a1, 22*SZREG(t0)
REG_S a1, 23*SZREG(t0)
REG_S a1, 24*SZREG(t0)
REG_S a1, 25*SZREG(t0)
REG_S a1, 26*SZREG(t0)
REG_S a1, 27*SZREG(t0)
REG_S a1, 28*SZREG(t0)
REG_S a1, 29*SZREG(t0)
REG_S a1, 30*SZREG(t0)
REG_S a1, 31*SZREG(t0)
addi t0, t0, 32*SZREG
bltu t0, a3, 3b
andi a2, a2, SZREG-1 /* Update count */
4:
/* Handle trailing misalignment */
beqz a2, 6f
add a3, t0, a2
5:
sb a1, 0(t0)
addi t0, t0, 1
bltu t0, a3, 5b
6:
ret
END(__memset)

View file

@ -112,3 +112,44 @@ EXPORT_SYMBOL(__memmove);
void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
EXPORT_SYMBOL(memmove);
void *__memset(void *s, int c, size_t count)
{
union types dest = { .as_u8 = s };
if (count >= MIN_THRESHOLD) {
unsigned long cu = (unsigned long)c;
/* Compose an ulong with 'c' repeated 4/8 times */
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
cu *= 0x0101010101010101UL;
#else
cu |= cu << 8;
cu |= cu << 16;
/* Suppress warning on 32 bit machines */
cu |= (cu << 16) << 16;
#endif
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
/*
* Fill the buffer one byte at time until
* the destination is word aligned.
*/
for (; count && dest.as_uptr & WORD_MASK; count--)
*dest.as_u8++ = c;
}
/* Copy using the largest size allowed */
for (; count >= BYTES_LONG; count -= BYTES_LONG)
*dest.as_ulong++ = cu;
}
/* copy the remainder */
while (count--)
*dest.as_u8++ = c;
return s;
}
EXPORT_SYMBOL(__memset);
void *memset(void *s, int c, size_t count) __weak __alias(__memset);
EXPORT_SYMBOL(memset);

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
OBJECT_FILES_NON_STANDARD := y
purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o rvstring.o memset.o
purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o rvstring.o
targets += $(purgatory-y)
PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
@ -15,9 +15,6 @@ $(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE
$(obj)/rvstring.o: $(srctree)/arch/riscv/lib/string.c FORCE
$(call if_changed_rule,cc_o_c)
$(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
$(call if_changed_rule,as_o_S)
$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
$(call if_changed_rule,cc_o_c)