[PATCH] AArch64: Improve strlen_asimd performance (bug 25824)


Wilco Dijkstra
Optimize strlen using a mix of scalar and SIMD code.  On modern
microarchitectures it is 2.6 times faster than the existing strlen_asimd on
large strings and 35% faster than the new MTE version of strlen.

On a random strlen benchmark using small sizes the speedup is 7% over
strlen_asimd and 40% over the MTE strlen.  This fixes the main strlen
regressions on Cortex-A53 and other cores with a simple Neon unit
(see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html).

Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
which can hopefully be added soon).

This fixes big-endian bug 25824.  The patch passes the GLIBC regression tests.

I'd like this for 2.32 to fix the bug and avoid any regressions.
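
As a rough illustration of the selection logic described above, here is a
minimal standalone C sketch.  The HWCAP2_MTE name and bit value are
assumptions (no HWCAP bit for MTE had been allocated when this was written),
and the two stub functions merely stand in for the glibc-internal
__strlen_mte and __strlen_asimd; this is not the libc_ifunc machinery itself.

/* Sketch only: assumed HWCAP2_MTE bit, stub implementations.  */
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/auxv.h>

#ifndef HWCAP2_MTE
# define HWCAP2_MTE (1UL << 18)          /* assumed value - check the final kernel ABI */
#endif

static size_t strlen_mte_stub (const char *s)   { return strlen (s); }  /* stand-in */
static size_t strlen_asimd_stub (const char *s) { return strlen (s); }  /* stand-in */

/* Mirrors: libc_ifunc (__strlen, MTE_ENABLED () ? __strlen_mte : __strlen_asimd).  */
static size_t (*select_strlen (void)) (const char *)
{
  unsigned long hwcap2 = getauxval (AT_HWCAP2);
  return (hwcap2 & HWCAP2_MTE) ? strlen_mte_stub : strlen_asimd_stub;
}

int main (void)
{
  size_t (*impl) (const char *) = select_strlen ();
  printf ("%zu\n", impl ("hello"));      /* prints 5 with either choice */
  return 0;
}

Once a real HWCAP bit exists, MTE_ENABLED () in strlen.c would presumably
perform an equivalent check using the hwcap value the dynamic loader records.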

---

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index a65c554bf3a60ccbed6b519bbbc46aabdf5b6025..4377df0735287c210efd661188f9e6e3923c8003 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -4,5 +4,5 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
    memcpy_new \
    memset_generic memset_falkor memset_emag memset_kunpeng \
    memchr_generic memchr_nosimd \
-   strlen_generic strlen_asimd
+   strlen_mte strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index c1df2a012ae17f45f26bd45fc98fe45b2f4d9eb1..1e22fdf8726bf4cd92aed09401b2772f514bf3dc 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -62,7 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   IFUNC_IMPL (i, name, strlen,
       IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
-      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
+      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -26,17 +26,15 @@
 # include <string.h>
 # include <init-arch.h>
 
-#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
+/* This should check HWCAP_MTE when it is available.  */
+#define MTE_ENABLED() (false)
 
 extern __typeof (__redirect_strlen) __strlen;
 
-extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
-libc_ifunc (__strlen,
-    (USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
-    ? __strlen_asimd
-    :__strlen_generic));
+libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 236a2c96a6eb5f02b0f0847d230857f0aee87fbe..076a905dceae501d85c1ab59a2250d8305c718f2 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -1,5 +1,4 @@
-/* Strlen implementation that uses ASIMD instructions for load and NULL checks.
-   Copyright (C) 2018-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2020 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -20,80 +19,90 @@
 #include <sysdep.h>
 
 /* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
+ */
+
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+   byte is zero, and can be done in parallel across the entire word.  */
 
-   ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k.  */
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
 
 /* To test the page crossing code path more thoroughly, compile with
    -DTEST_PAGE_CROSS - this will force all calls through the slower
    entry path.  This option is not intended for production use.  */
 
-/* Arguments and results.  */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries.  */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-#define dataq q2
-#define datav v2
-#define datab2 b3
-#define dataq2 q3
-#define datav2 v3
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 16
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
- /* Since strings are short on average, we check the first 16 bytes
-   of the string for a NUL character.  In order to do an unaligned load
-   safely we have to do a page cross check first.  If there is a NUL
-   byte we calculate the length from the 2 8-byte words using
-   conditional select to reduce branch mispredictions (it is unlikely
-   strlen_asimd will be repeatedly called on strings with the same
-   length).
-
-   If the string is longer than 16 bytes, we align src so don't need
-   further page cross checks, and process 16 bytes per iteration.
-
-   If the page cross check fails, we read 16 bytes from an aligned
-   address, remove any characters before the string, and continue
-   in the main loop using aligned loads.  Since strings crossing a
-   page in the first 16 bytes are rare (probability of
-   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-   AArch64 systems have a minimum page size of 4k.  We don't bother
-   checking for larger page sizes - the cost of setting up the correct
-   page size is just not worth the extra gain from a small reduction in
-   the cases taking the slow path.  Note that we only care about
-   whether the first fetch, which may be misaligned, crosses a page
-   boundary.  */
-
-ENTRY_ALIGN (__strlen_asimd, 6)
- DELOUSE (0)
- DELOUSE (1)
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string.  If it contains a NUL
+   character, return the length, if not, continue in the main loop.  */
+
+ENTRY (__strlen_asimd)
+        DELOUSE (0)
+
  and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes.  */
  ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
 #ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+   string is 0x01) means we cannot use has_nul1/2 directly.
+   Since we expect strings to be small and early-exit,
+   byte-swap the data now so has_null1/2 will be correct.  */
  rev data1, data1
  rev data2, data2
 #endif
-
  sub tmp1, data1, zeroones
  orr tmp2, data1, REP8_7f
  sub tmp3, data2, zeroones
@@ -101,78 +110,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)
  bics has_nul1, tmp1, tmp2
  bic has_nul2, tmp3, tmp4
  ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
+
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+   from the string start.  Enter with C = has_nul1 == 0.  */
  csel has_nul1, has_nul1, has_nul2, cc
  mov len, 8
  rev has_nul1, has_nul1
- clz tmp1, has_nul1
  csel len, xzr, len, cc
+ clz tmp1, has_nul1
  add len, len, tmp1, lsr 3
  ret
 
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-
-L(main_loop):
- ldr dataq, [src, 32]!
-L(page_cross_entry):
- /* Get the minimum value and keep going if it is not zero.  */
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- cbz tmp1, L(tail)
- ldr dataq, [src, 16]
- uminv datab2, datav.16b
- mov tmp1, datav2.d[0]
- cbnz tmp1, L(main_loop)
- add src, src, 16
-
-L(tail):
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
 #ifdef __AARCH64EB__
- rev64 datav.16b, datav.16b
-#endif
- /* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
-   pair of scalars and then compute the length from the earliest NULL
-   byte.  */
- cmeq datav.16b, datav.16b, #0
- mov data1, datav.d[0]
- mov data2, datav.d[1]
- cmp data1, 0
- csel data1, data1, data2, ne
- sub len, src, srcin
  rev data1, data1
- add tmp2, len, 8
- clz tmp1, data1
- csel len, len, tmp2, ne
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ b.eq L(loop_entry)
+
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+   the string start.  Enter with C = has_nul1 == 0.  */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 24
+ rev has_nul1, has_nul1
+ mov tmp3, 16
+ clz tmp1, has_nul1
+ csel len, tmp3, len, cc
  add len, len, tmp1, lsr 3
  ret
 
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-   srcin to 0xff, so we ignore any NUL bytes before the string.
-   Then continue in the aligned loop.  */
-L(page_cross):
- mov tmp3, 63
- bic src, srcin, 15
- and tmp1, srcin, 7
- ands tmp2, srcin, 8
- ldr dataq, [src]
- lsl tmp1, tmp1, 3
- csel tmp2, tmp2, tmp1, eq
- csel tmp1, tmp1, tmp3, eq
- mov tmp4, -1
+L(loop_entry):
+ bic src, srcin, 31
+
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset.  */
 #ifdef __AARCH64EB__
- /* Big-endian.  Early bytes are at MSB.  */
- lsr tmp1, tmp4, tmp1
- lsr tmp2, tmp4, tmp2
+ bic maskv.8h, 0xf0
 #else
- /* Little-endian.  Early bytes are at LSB.  */
- lsl tmp1, tmp4, tmp1
- lsl tmp2, tmp4, tmp2
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
 #endif
- mov datav2.d[0], tmp1
- mov datav2.d[1], tmp2
- orn datav.16b, datav.16b, datav2.16b
- b L(page_cross_entry)
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+        .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
 END (__strlen_asimd)
 weak_alias (__strlen_asimd, strlen_asimd)
 libc_hidden_builtin_def (strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_mte.S
similarity index 88%
rename from sysdeps/aarch64/multiarch/strlen_generic.S
rename to sysdeps/aarch64/multiarch/strlen_mte.S
index 61d3f72c9985bdd103d5e4c68337fed4a55511be..b8daa54dd89afbd99a6338cef45f49a25defaa26 100644
--- a/sysdeps/aarch64/multiarch/strlen_generic.S
+++ b/sysdeps/aarch64/multiarch/strlen_mte.S
@@ -17,14 +17,14 @@
    <https://www.gnu.org/licenses/>.  */
 
 /* The actual strlen code is in ../strlen.S.  If we are building libc this file
-   defines __strlen_generic.  Otherwise the include of ../strlen.S will define
+   defines __strlen_mte.  Otherwise the include of ../strlen.S will define
    the normal __strlen  entry points.  */
 
 #include <sysdep.h>
 
 #if IS_IN (libc)
 
-# define STRLEN __strlen_generic
+# define STRLEN __strlen_mte
 
 /* Do not hide the generic version of strlen, we use it internally.  */
 # undef libc_hidden_builtin_def
@@ -32,7 +32,7 @@
 
 # ifdef SHARED
 /* It doesn't make sense to send libc-internal strlen calls through a PLT. */
- .globl __GI_strlen; __GI_strlen = __strlen_generic
+ .globl __GI_strlen; __GI_strlen = __strlen_mte
 # endif
 #endif
 
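The header comment added above relies on the classic scalar NUL test:
(X - 0x01..01) & ~(X | 0x7f..7f) is non-zero exactly when the 64-bit word X
contains a zero byte; carry propagation only affects which bits end up set
after the first NUL, which is why the patch byte-swaps on big-endian before
locating it.  A minimal C sketch of that test, purely for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

static int has_nul_byte (uint64_t x)
{
  /* Matches the patch: bics has_nul, (x - REP8_01), (x | REP8_7f).  */
  return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
}

int main (void)
{
  uint64_t a, b;
  memcpy (&a, "abcdefgh", 8);           /* no NUL among these 8 bytes */
  memcpy (&b, "abc\0efgh", 8);          /* NUL at byte offset 3 */
  printf ("%d %d\n", has_nul_byte (a), has_nul_byte (b));   /* prints: 0 1 */
  return 0;
}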

Re: [PATCH] AArch64: Improve strlen_asimd performance (bug 25824)

Carlos O'Donell
On 7/16/20 9:00 AM, Wilco Dijkstra wrote:

> Optimize strlen using a mix of scalar and SIMD code. On modern micro
> architectures large strings are 2.6 times faster than existing strlen_asimd
> and 35% faster than the new MTE version of strlen.
>
> On a random strlen benchmark using small sizes the speedup is 7% vs
> strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen
> regressions on Cortex-A53 and other cores with a simple Neon unit
> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )
>
> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
> which can hopefully be added soon).
>
> This fixes big-endian bug 25824. Passes GLIBC regression tests.
>
> I'd like this for 2.32 to fix the bug and avoid any regressions.

The copyright/description changes don't look correct.

Overall this is OK for 2.32, but Szabolcs should review this also.
 

> ---
>
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index a65c554bf3a60ccbed6b519bbbc46aabdf5b6025..4377df0735287c210efd661188f9e6e3923c8003 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -4,5 +4,5 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
>     memcpy_new \
>     memset_generic memset_falkor memset_emag memset_kunpeng \
>     memchr_generic memchr_nosimd \
> -   strlen_generic strlen_asimd
> +   strlen_mte strlen_asimd
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index c1df2a012ae17f45f26bd45fc98fe45b2f4d9eb1..1e22fdf8726bf4cd92aed09401b2772f514bf3dc 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -62,7 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    IFUNC_IMPL (i, name, strlen,
>        IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
> -      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
> +      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_mte))
>  
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
> --- a/sysdeps/aarch64/multiarch/strlen.c
> +++ b/sysdeps/aarch64/multiarch/strlen.c
> @@ -26,17 +26,15 @@
>  # include <string.h>
>  # include <init-arch.h>
>  
> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
> +/* This should check HWCAP_MTE when it is available.  */
> +#define MTE_ENABLED() (false)

OK.

>  
>  extern __typeof (__redirect_strlen) __strlen;
>  
> -extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
> +extern __typeof (__redirect_strlen) __strlen_mte attribute_hidden;
>  extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
>  
> -libc_ifunc (__strlen,
> -    (USE_ASIMD_STRLEN () || IS_KUNPENG920 (midr)
> -    ? __strlen_asimd
> -    :__strlen_generic));
> +libc_ifunc (__strlen, (MTE_ENABLED () ? __strlen_mte : __strlen_asimd));

OK.

>  
>  # undef strlen
>  strong_alias (__strlen, strlen);
> diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
> index 236a2c96a6eb5f02b0f0847d230857f0aee87fbe..076a905dceae501d85c1ab59a2250d8305c718f2 100644
> --- a/sysdeps/aarch64/multiarch/strlen_asimd.S
> +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
> @@ -1,5 +1,4 @@
> -/* Strlen implementation that uses ASIMD instructions for load and NULL checks.
> -   Copyright (C) 2018-2020 Free Software Foundation, Inc.

Why are you removing the first line description?

> +/* Copyright (C) 2020 Free Software Foundation, Inc.

This needs justification.

>  
>     This file is part of the GNU C Library.
>  
> @@ -20,80 +19,90 @@
>  #include <sysdep.h>
>  
>  /* Assumptions:
> + *
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> + * Not MTE compatible.
> + */
> +
> +#define srcin x0
> +#define len x0
> +
> +#define src x1
> +#define data1 x2
> +#define data2 x3
> +#define has_nul1 x4
> +#define has_nul2 x5
> +#define tmp1 x4
> +#define tmp2 x5
> +#define tmp3 x6
> +#define tmp4 x7
> +#define zeroones x8
> +
> +#define maskv v0
> +#define maskd d0
> +#define dataq1 q1
> +#define dataq2 q2
> +#define datav1 v1
> +#define datav2 v2
> +#define tmp x2
> +#define tmpw w2
> +#define synd x3
> +#define shift x4
> +
> +/* For the first 32 bytes, NUL detection works on the principle that
> +   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
> +   byte is zero, and can be done in parallel across the entire word.  */
>  
> -   ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k.  */
> +#define REP8_01 0x0101010101010101
> +#define REP8_7f 0x7f7f7f7f7f7f7f7f
>  
>  /* To test the page crossing code path more thoroughly, compile with
>     -DTEST_PAGE_CROSS - this will force all calls through the slower
>     entry path.  This option is not intended for production use.  */
>  
> -/* Arguments and results.  */
> -#define srcin x0
> -#define len x0
> -
> -/* Locals and temporaries.  */
> -#define src x1
> -#define data1 x2
> -#define data2 x3
> -#define has_nul1 x4
> -#define has_nul2 x5
> -#define tmp1 x4
> -#define tmp2 x5
> -#define tmp3 x6
> -#define tmp4 x7
> -#define zeroones x8
> -#define dataq q2
> -#define datav v2
> -#define datab2 b3
> -#define dataq2 q3
> -#define datav2 v3
> -
> -#define REP8_01 0x0101010101010101
> -#define REP8_7f 0x7f7f7f7f7f7f7f7f
> -
>  #ifdef TEST_PAGE_CROSS
> -# define MIN_PAGE_SIZE 16
> +# define MIN_PAGE_SIZE 32
>  #else
>  # define MIN_PAGE_SIZE 4096
>  #endif
>  
> - /* Since strings are short on average, we check the first 16 bytes
> -   of the string for a NUL character.  In order to do an unaligned load
> -   safely we have to do a page cross check first.  If there is a NUL
> -   byte we calculate the length from the 2 8-byte words using
> -   conditional select to reduce branch mispredictions (it is unlikely
> -   strlen_asimd will be repeatedly called on strings with the same
> -   length).
> -
> -   If the string is longer than 16 bytes, we align src so don't need
> -   further page cross checks, and process 16 bytes per iteration.
> -
> -   If the page cross check fails, we read 16 bytes from an aligned
> -   address, remove any characters before the string, and continue
> -   in the main loop using aligned loads.  Since strings crossing a
> -   page in the first 16 bytes are rare (probability of
> -   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> -
> -   AArch64 systems have a minimum page size of 4k.  We don't bother
> -   checking for larger page sizes - the cost of setting up the correct
> -   page size is just not worth the extra gain from a small reduction in
> -   the cases taking the slow path.  Note that we only care about
> -   whether the first fetch, which may be misaligned, crosses a page
> -   boundary.  */
> -
> -ENTRY_ALIGN (__strlen_asimd, 6)
> - DELOUSE (0)
> - DELOUSE (1)
> +/* Core algorithm:
> +
> +   Since strings are short on average, we check the first 32 bytes of the
> +   string for a NUL character without aligning the string.  In order to use
> +   unaligned loads safely we must do a page cross check first.
> +
> +   If there is a NUL byte we calculate the length from the 2 8-byte words
> +   using conditional select to reduce branch mispredictions (it is unlikely
> +   strlen will be repeatedly called on strings with the same length).
> +
> +   If the string is longer than 32 bytes, align src so we don't need further
> +   page cross checks, and process 32 bytes per iteration using a fast SIMD
> +   loop.
> +
> +   If the page cross check fails, we read 32 bytes from an aligned address,
> +   and ignore any characters before the string.  If it contains a NUL
> +   character, return the length, if not, continue in the main loop.  */
> +
> +ENTRY (__strlen_asimd)
> +        DELOUSE (0)
> +
>   and tmp1, srcin, MIN_PAGE_SIZE - 1
> - mov zeroones, REP8_01
> - cmp tmp1, MIN_PAGE_SIZE - 16
> - b.gt L(page_cross)
> + cmp tmp1, MIN_PAGE_SIZE - 32
> + b.hi L(page_cross)
> +
> + /* Look for a NUL byte in the first 16 bytes.  */
>   ldp data1, data2, [srcin]
> + mov zeroones, REP8_01
> +
>  #ifdef __AARCH64EB__
> + /* For big-endian, carry propagation (if the final byte in the
> +   string is 0x01) means we cannot use has_nul1/2 directly.
> +   Since we expect strings to be small and early-exit,
> +   byte-swap the data now so has_null1/2 will be correct.  */
>   rev data1, data1
>   rev data2, data2
>  #endif
> -
>   sub tmp1, data1, zeroones
>   orr tmp2, data1, REP8_7f
>   sub tmp3, data2, zeroones
> @@ -101,78 +110,105 @@ ENTRY_ALIGN (__strlen_asimd, 6)
>   bics has_nul1, tmp1, tmp2
>   bic has_nul2, tmp3, tmp4
>   ccmp has_nul2, 0, 0, eq
> - beq L(main_loop_entry)
> + b.eq L(bytes16_31)
> +
> + /* Find the exact offset of the first NUL byte in the first 16 bytes
> +   from the string start.  Enter with C = has_nul1 == 0.  */
>   csel has_nul1, has_nul1, has_nul2, cc
>   mov len, 8
>   rev has_nul1, has_nul1
> - clz tmp1, has_nul1
>   csel len, xzr, len, cc
> + clz tmp1, has_nul1
>   add len, len, tmp1, lsr 3
>   ret
>  
> -L(main_loop_entry):
> - bic src, srcin, 15
> - sub src, src, 16
> -
> -L(main_loop):
> - ldr dataq, [src, 32]!
> -L(page_cross_entry):
> - /* Get the minimum value and keep going if it is not zero.  */
> - uminv datab2, datav.16b
> - mov tmp1, datav2.d[0]
> - cbz tmp1, L(tail)
> - ldr dataq, [src, 16]
> - uminv datab2, datav.16b
> - mov tmp1, datav2.d[0]
> - cbnz tmp1, L(main_loop)
> - add src, src, 16
> -
> -L(tail):
> + .p2align 3
> + /* Look for a NUL byte at offset 16..31 in the string.  */
> +L(bytes16_31):
> + ldp data1, data2, [srcin, 16]
>  #ifdef __AARCH64EB__
> - rev64 datav.16b, datav.16b
> -#endif
> - /* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
> -   pair of scalars and then compute the length from the earliest NULL
> -   byte.  */
> - cmeq datav.16b, datav.16b, #0
> - mov data1, datav.d[0]
> - mov data2, datav.d[1]
> - cmp data1, 0
> - csel data1, data1, data2, ne
> - sub len, src, srcin
>   rev data1, data1
> - add tmp2, len, 8
> - clz tmp1, data1
> - csel len, len, tmp2, ne
> + rev data2, data2
> +#endif
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, REP8_7f
> + sub tmp3, data2, zeroones
> + orr tmp4, data2, REP8_7f
> + bics has_nul1, tmp1, tmp2
> + bic has_nul2, tmp3, tmp4
> + ccmp has_nul2, 0, 0, eq
> + b.eq L(loop_entry)
> +
> + /* Find the exact offset of the first NUL byte at offset 16..31 from
> +   the string start.  Enter with C = has_nul1 == 0.  */
> + csel has_nul1, has_nul1, has_nul2, cc
> + mov len, 24
> + rev has_nul1, has_nul1
> + mov tmp3, 16
> + clz tmp1, has_nul1
> + csel len, tmp3, len, cc
>   add len, len, tmp1, lsr 3
>   ret
>  
> - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> -   srcin to 0xff, so we ignore any NUL bytes before the string.
> -   Then continue in the aligned loop.  */
> -L(page_cross):
> - mov tmp3, 63
> - bic src, srcin, 15
> - and tmp1, srcin, 7
> - ands tmp2, srcin, 8
> - ldr dataq, [src]
> - lsl tmp1, tmp1, 3
> - csel tmp2, tmp2, tmp1, eq
> - csel tmp1, tmp1, tmp3, eq
> - mov tmp4, -1
> +L(loop_entry):
> + bic src, srcin, 31
> +
> + .p2align 5
> +L(loop):
> + ldp dataq1, dataq2, [src, 32]!
> + uminp maskv.16b, datav1.16b, datav2.16b
> + uminp maskv.16b, maskv.16b, maskv.16b
> + cmeq maskv.8b, maskv.8b, 0
> + fmov synd, maskd
> + cbz synd, L(loop)
> +
> + /* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
> + cmeq maskv.16b, datav1.16b, 0
> + sub len, src, srcin
> + tst synd, 0xffffffff
> + b.ne 1f
> + cmeq maskv.16b, datav2.16b, 0
> + add len, len, 16
> +1:
> + /* Generate a bitmask and compute correct byte offset.  */
>  #ifdef __AARCH64EB__
> - /* Big-endian.  Early bytes are at MSB.  */
> - lsr tmp1, tmp4, tmp1
> - lsr tmp2, tmp4, tmp2
> + bic maskv.8h, 0xf0
>  #else
> - /* Little-endian.  Early bytes are at LSB.  */
> - lsl tmp1, tmp4, tmp1
> - lsl tmp2, tmp4, tmp2
> + bic maskv.8h, 0x0f, lsl 8
> +#endif
> + umaxp maskv.16b, maskv.16b, maskv.16b
> + fmov synd, maskd
> +#ifndef __AARCH64EB__
> + rbit synd, synd
>  #endif
> - mov datav2.d[0], tmp1
> - mov datav2.d[1], tmp2
> - orn datav.16b, datav.16b, datav2.16b
> - b L(page_cross_entry)
> + clz tmp, synd
> + add len, len, tmp, lsr 2
> + ret
> +
> +        .p2align 4
> +
> +L(page_cross):
> + bic src, srcin, 31
> + mov tmpw, 0x0c03
> + movk tmpw, 0xc030, lsl 16
> + ld1 {datav1.16b, datav2.16b}, [src]
> + dup maskv.4s, tmpw
> + cmeq datav1.16b, datav1.16b, 0
> + cmeq datav2.16b, datav2.16b, 0
> + and datav1.16b, datav1.16b, maskv.16b
> + and datav2.16b, datav2.16b, maskv.16b
> + addp maskv.16b, datav1.16b, datav2.16b
> + addp maskv.16b, maskv.16b, maskv.16b
> + fmov synd, maskd
> + lsl shift, srcin, 1
> + lsr synd, synd, shift
> + cbz synd, L(loop)
> +
> + rbit synd, synd
> + clz len, synd
> + lsr len, len, 1
> + ret
> +
>  END (__strlen_asimd)
>  weak_alias (__strlen_asimd, strlen_asimd)
>  libc_hidden_builtin_def (strlen_asimd)
> diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_mte.S
> similarity index 88%
> rename from sysdeps/aarch64/multiarch/strlen_generic.S
> rename to sysdeps/aarch64/multiarch/strlen_mte.S
> index 61d3f72c9985bdd103d5e4c68337fed4a55511be..b8daa54dd89afbd99a6338cef45f49a25defaa26 100644
> --- a/sysdeps/aarch64/multiarch/strlen_generic.S
> +++ b/sysdeps/aarch64/multiarch/strlen_mte.S

OK. Rename.

> @@ -17,14 +17,14 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  /* The actual strlen code is in ../strlen.S.  If we are building libc this file
> -   defines __strlen_generic.  Otherwise the include of ../strlen.S will define
> +   defines __strlen_mte.  Otherwise the include of ../strlen.S will define

OK.

>     the normal __strlen  entry points.  */
>  
>  #include <sysdep.h>
>  
>  #if IS_IN (libc)
>  
> -# define STRLEN __strlen_generic
> +# define STRLEN __strlen_mte
>  
>  /* Do not hide the generic version of strlen, we use it internally.  */
>  # undef libc_hidden_builtin_def
> @@ -32,7 +32,7 @@
>  
>  # ifdef SHARED
>  /* It doesn't make sense to send libc-internal strlen calls through a PLT. */
> - .globl __GI_strlen; __GI_strlen = __strlen_generic
> + .globl __GI_strlen; __GI_strlen = __strlen_mte
>  # endif
>  #endif
>  
>
>


--
Cheers,
Carlos.
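
For readers following the quoted loop, the uminp/cmeq/fmov sequence scans 32
bytes per iteration for a NUL.  A rough NEON-intrinsics rendering of just that
check is sketched below; it assumes an AArch64 compiler with <arm_neon.h>, is
not the committed assembly, and omits the page-cross and tail handling.

#include <arm_neon.h>
#include <stdio.h>

/* Returns non-zero if any of the 32 bytes starting at p is zero.
   p must point to 32 readable bytes (the real code guarantees this via
   the page-cross check and the 32-byte alignment of src).  */
static int nul_in_32 (const unsigned char *p)
{
  uint8x16_t d1 = vld1q_u8 (p);
  uint8x16_t d2 = vld1q_u8 (p + 16);
  /* Pairwise minimum folds 32 bytes to 16, then 16 to 8: any zero byte
     in the input leaves a zero byte in the low half (as uminp does).  */
  uint8x16_t m = vpminq_u8 (d1, d2);
  m = vpminq_u8 (m, m);
  /* Compare the low 8 bytes against zero and move the 64-bit syndrome
     to a general register, as cmeq + fmov do in the assembly.  */
  uint8x8_t z = vceq_u8 (vget_low_u8 (m), vdup_n_u8 (0));
  return vget_lane_u64 (vreinterpret_u64_u8 (z), 0) != 0;
}

int main (void)
{
  unsigned char buf[32] = "a string shorter than 32 bytes";
  printf ("%d\n", nul_in_32 (buf));     /* prints 1: the terminator is inside */
  return 0;
}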


Re: [PATCH] AArch64: Improve strlen_asimd performance (bug 25824)

Szabolcs Nagy
The 07/16/2020 11:31, Carlos O'Donell wrote:

> On 7/16/20 9:00 AM, Wilco Dijkstra wrote:
> > Optimize strlen using a mix of scalar and SIMD code. On modern micro
> > architectures large strings are 2.6 times faster than existing strlen_asimd
> > and 35% faster than the new MTE version of strlen.
> >
> > On a random strlen benchmark using small sizes the speedup is 7% vs
> > strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen
> > regressions on Cortex-A53 and other cores with a simple Neon unit
> > (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )
> >
> > Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
> > MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
> > which can hopefully be added soon).
> >
> > This fixes big-endian bug 25824. Passes GLIBC regression tests.
> >
> > I'd like this for 2.32 to fix the bug and avoid any regressions.
>
> The copyright/description changes don't look correct.
>
> Overall this is OK for 2.32, but Szabolcs should review this also.
...

> > diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
> > index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
> > --- a/sysdeps/aarch64/multiarch/strlen.c
> > +++ b/sysdeps/aarch64/multiarch/strlen.c
> > @@ -26,17 +26,15 @@
> >  # include <string.h>
> >  # include <init-arch.h>
> >  
> > -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
> > +/* This should check HWCAP_MTE when it is available.  */
> > +#define MTE_ENABLED() (false)
>
> OK.

I'd like glibc 2.32 to support heap tagging via malloc
interposition (on future Linux + future hardware).

MTE_ENABLED () == false in the ifunc dispatch prevents that
(we select the non-MTE-safe strlen).

Is adding the HWCAP value right before the release OK?
I need to discuss with the Linux developers whether we can
reserve the value ahead of time.

The patch is OK with the current logic; I will try to deal
with this issue separately.

Re: [PATCH] AArch64: Improve strlen_asimd performance (bug 25824)

Carlos O'Donell
On 7/16/20 12:52 PM, Szabolcs Nagy wrote:

> The 07/16/2020 11:31, Carlos O'Donell wrote:
>> On 7/16/20 9:00 AM, Wilco Dijkstra wrote:
>>> Optimize strlen using a mix of scalar and SIMD code. On modern micro
>>> architectures large strings are 2.6 times faster than existing strlen_asimd
>>> and 35% faster than the new MTE version of strlen.
>>>
>>> On a random strlen benchmark using small sizes the speedup is 7% vs
>>> strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen
>>> regressions on Cortex-A53 and other cores with a simple Neon unit
>>> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )
>>>
>>> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
>>> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
>>> which can hopefully be added soon).
>>>
>>> This fixes big-endian bug 25824. Passes GLIBC regression tests.
>>>
>>> I'd like this for 2.32 to fix the bug and avoid any regressions.
>>
>> The copyright/description changes don't look correct.
>>
>> Overall this is OK for 2.32, but Szabolcs should review this also.
> ...
>>> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
>>> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
>>> --- a/sysdeps/aarch64/multiarch/strlen.c
>>> +++ b/sysdeps/aarch64/multiarch/strlen.c
>>> @@ -26,17 +26,15 @@
>>>  # include <string.h>
>>>  # include <init-arch.h>
>>>  
>>> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
>>> +/* This should check HWCAP_MTE when it is available.  */
>>> +#define MTE_ENABLED() (false)
>>
>> OK.
>
> i'd like glibc 2.32 to support heap tagging via malloc
> interposition (on future linux + future hw).
>
> MTE_ENABLED==false in the ifunc dispatch prevents that.
> (we select non-mte-safe strlen)
>
> is adding the HWCAP value right before release OK?
> i need to discuss with linux devs if we can reserve
> the value ahead of time.

glibc would obviously like to see that HWCAP value finalized
and in a released kernel so it doesn't change.
 
> the patch is OK with the current logic, i will try to deal
> with this issue separately.
 
I assume this means you accept the patch as-is?

It's clearer if people provide a "Reviewed-by:" line in cases
like this; that way they indicate a clear intent that the patch
has been reviewed and the substantial issues resolved.

--
Cheers,
Carlos.


Re: [PATCH] AArch64: Improve strlen_asimd performance (bug 25824)

Szabolcs Nagy
The 07/16/2020 14:28, Carlos O'Donell wrote:

> On 7/16/20 12:52 PM, Szabolcs Nagy wrote:
> > The 07/16/2020 11:31, Carlos O'Donell wrote:
> >> On 7/16/20 9:00 AM, Wilco Dijkstra wrote:
> >>> Optimize strlen using a mix of scalar and SIMD code. On modern micro
> >>> architectures large strings are 2.6 times faster than existing strlen_asimd
> >>> and 35% faster than the new MTE version of strlen.
> >>>
> >>> On a random strlen benchmark using small sizes the speedup is 7% vs
> >>> strlen_asimd and 40% vs the MTE strlen.  This fixes the main strlen
> >>> regressions on Cortex-A53 and other cores with a simple Neon unit
> >>> (see https://sourceware.org/pipermail/libc-alpha/2020-June/114641.html )
> >>>
> >>> Rename __strlen_generic to __strlen_mte, and select strlen_asimd when
> >>> MTE is not enabled (this is waiting on support for a HWCAP_MTE bit
> >>> which can hopefully be added soon).
> >>>
> >>> This fixes big-endian bug 25824. Passes GLIBC regression tests.
> >>>
> >>> I'd like this for 2.32 to fix the bug and avoid any regressions.
> >>
> >> The copyright/description changes don't look correct.
> >>
> >> Overall this is OK for 2.32, but Szabolcs should review this also.
> > ...
> >>> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
> >>> index 99f2cf2cde54fd1158383d097ba51edc1377f55b..7c0352dd878086708ac785807bc4d210b85e528f 100644
> >>> --- a/sysdeps/aarch64/multiarch/strlen.c
> >>> +++ b/sysdeps/aarch64/multiarch/strlen.c
> >>> @@ -26,17 +26,15 @@
> >>>  # include <string.h>
> >>>  # include <init-arch.h>
> >>>  
> >>> -#define USE_ASIMD_STRLEN() IS_FALKOR (midr)
> >>> +/* This should check HWCAP_MTE when it is available.  */
> >>> +#define MTE_ENABLED() (false)
> >>
> >> OK.
> >
> > i'd like glibc 2.32 to support heap tagging via malloc
> > interposition (on future linux + future hw).
> >
> > MTE_ENABLED==false in the ifunc dispatch prevents that.
> > (we select non-mte-safe strlen)
> >
> > is adding the HWCAP value right before release OK?
> > i need to discuss with linux devs if we can reserve
> > the value ahead of time.
>
> glibc would obviously like to see that HWCAP value finalized
> and in a released kernel so it doesn't change.
>  
> > the patch is OK with the current logic, i will try to deal
> > with this issue separately.
>  
> I assume this means you accept the patch as-is?
>
> It's clearer if people provided a "Reviewed-by:" line in cases
> like this, that way they indicate a clear intent that the patch
> is reviewed and substantial issues solved.

OK with the description header fix.

Reviewed-by: Szabolcs Nagy <[hidden email]>