[PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

classic Classic list List threaded Threaded
13 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Feng Xue OS
This patch is composed to add a tunable 'glibc.memset.dc_zva_threshold'
to control using DC ZVA in memset or not. Only when memset size exceeds
this threshold, DC ZVA will be used.

The background is that DC ZVA does not always outperform normal
memory-store zeroing, especially when there are multiple processes/threads
contending for memory/cache.

Feng
----

    * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
    * sysdeps/aarch64/dl-tunables.list (glibc):
    Add memset.dc_zva_threshold.
    * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
    dl-tunables.h
    (INIT_ZVA_THRESHOLD): New macro, with new local variable
    zva_threshold.
    (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
    * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
    New variable.
    (init_memset): New macro.
    * sysdeps/aarch64/memset_base64.S (__memset_base64) [HAVE_TUNABLES]:
    Add conditional compare over __memset_dc_zva_threshold.
    * sysdeps/aarch64/memset_emag.S (DC_ZVA_THRESHOLD): Changed to a
    new value.
---
 ChangeLog                                 | 18 ++++++++++++++++++
 manual/tunables.texi                      |  9 +++++++++
 sysdeps/aarch64/dl-tunables.list          |  6 ++++++
 sysdeps/aarch64/multiarch/init-arch.h     | 11 +++++++++++
 sysdeps/aarch64/multiarch/memset.c        | 22 +++++++++++++++++++---
 sysdeps/aarch64/multiarch/memset_base64.S |  7 +++++++
 sysdeps/aarch64/multiarch/memset_emag.S   |  4 ++--
 7 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index dbdb85d..1921e2a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2019-07-26  Feng Xue  <[hidden email]>
+
+ * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
+ * sysdeps/aarch64/dl-tunables.list (glibc):
+ Add memset.dc_zva_threshold.
+ * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
+ dl-tunables.h
+ (INIT_ZVA_THRESHOLD): New macro, with new local variable
+ zva_threshold.
+ (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
+ * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
+ New variable.
+ (init_memset): New macro.
+ * sysdeps/aarch64/memset_base64.S (__memset_base64) [HAVE_TUNABLES]:
+ Add conditional compare over __memset_dc_zva_threshold.
+ * sysdeps/aarch64/memset_emag.S (DC_ZVA_THRESHOLD): Changed to a
+ new value.
+
 2019-07-25  Florian Weimer  <[hidden email]>
 
  [BZ #24677]
diff --git a/manual/tunables.texi b/manual/tunables.texi
index ee0fdf2..c7c13cc 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -411,3 +411,12 @@ instead.
 
 This tunable is specific to i386 and x86-64.
 @end deftp
+
+@deftp Tunable glibc.memset.dc_zva_threshold
+The @code{glibc.memset.dc_zva_threshold} tunable allows the user to set
+threshold to trigger DC ZVA in memset. When memset size is less than this
+threshold, normal memory store instruction will be used, otherwise DC ZVA
+instruction will be used. Value of zero means default threshold.
+
+This tunable is specific to aarch64.
+@end deftp
diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
index 5fac533..0f5b5e1 100644
--- a/sysdeps/aarch64/dl-tunables.list
+++ b/sysdeps/aarch64/dl-tunables.list
@@ -22,4 +22,10 @@ glibc {
       type: STRING
     }
   }
+  memset {
+    dc_zva_threshold {
+      type: SIZE_T
+      default: 0
+    }
+  }
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index b9020ae..93133a2 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,7 +18,18 @@
 
 #include <ldsodefs.h>
 
+#if HAVE_TUNABLES
+# include <elf/dl-tunables.h>
+
+# define INIT_ZVA_THRESHOLD()      \
+  uint64_t __attribute__((unused)) zva_threshold =      \
+    TUNABLE_GET(glibc, memset, dc_zva_threshold, size_t, NULL);
+#else
+# define INIT_ZVA_THRESHOLD()
+#endif
+
 #define INIT_ARCH()      \
+  INIT_ZVA_THRESHOLD()      \
   uint64_t __attribute__((unused)) midr =      \
     GLRO(dl_aarch64_cpu_features).midr_el1;      \
   unsigned __attribute__((unused)) zva_size =      \
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 4817587..2015bce 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -32,12 +32,28 @@ extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
 extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
+# if HAVE_TUNABLES
+uint64_t __memset_dc_zva_threshold = 512;
+
+#  define init_memset(fn, default_zva_threshold) \
+({ \
+  if (zva_threshold) \
+    __memset_dc_zva_threshold = zva_threshold; \
+  else if (default_zva_threshold) \
+    __memset_dc_zva_threshold = default_zva_threshold; \
+  fn; \
+})
+# else
+#  define init_memset(fn, default_zva_threshold)  (fn)
+# endif
+
 libc_ifunc (__libc_memset,
+
     ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
-     ? __memset_falkor
+     ? init_memset (__memset_falkor, 0)
      : (IS_EMAG (midr) && zva_size == 64
-       ? __memset_emag
-       : __memset_generic)));
+       ? init_memset (__memset_emag, 8*1024*1024)
+       : init_memset (__memset_generic, 0))));
 
 # undef memset
 strong_alias (__libc_memset, memset);
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..6350a6d 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -91,7 +91,14 @@ L(set96):
  .p2align 4
 L(set_long):
  stp val, val, [dstin]
+#if HAVE_TUNABLES
+ adrp tmp1, __memset_dc_zva_threshold
+ add tmp1, tmp1, :lo12:__memset_dc_zva_threshold
+ ldr tmp2, [tmp1] /* Load DC ZVA tunable threshold value. */
+ cmp count, tmp2
+#else
  cmp count, DC_ZVA_THRESHOLD
+#endif
  ccmp val, 0, 0, cs
  bic dst, dstin, 15
  b.eq L(zva_64)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..78a2a14 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -23,10 +23,10 @@
 /*
  * Using dc zva to zero memory does not produce better performance if
  * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
+ * processes/threads contending memory/cache. Here we use a very
  * large threshold to trigger usage of dc zva.
 */
-# define DC_ZVA_THRESHOLD 1024
+# define DC_ZVA_THRESHOLD (8*1024*1024)
 
 # include "./memset_base64.S"
 #endif
--
1.8.3.1

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Szabolcs Nagy-2
On 26/07/2019 12:58, Feng Xue OS wrote:
> This patch is composed to add a tunable 'glibc.memset.dc_zva_threshold'
> to control using DC ZVA in memset or not. Only when memset size exceeds
> this threshold, DC ZVA will be used.
>
> The background is that DC ZVA does not always outperform normal
> memory-store zeroing, especially when there are multiple processes/threads
> contending for memory/cache.

adding a threshold to memset_emag is fine, but
i'm not yet convinced that a tunable threshold
is useful enough.

is it expected that different workloads require
different setting? is this effect significant?
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Feng Xue OS
Yes. For workloads with many parallel threads on emag, we get a clear
improvement by using a large threshold to trigger DC ZVA, or even by disabling it, while
for single-threaded workloads the situation is reversed. Since there is no smart way to identify
workload characteristics at runtime, we propose introducing this tunable.

Feng

________________________________________
From: Szabolcs Nagy <[hidden email]>
Sent: Friday, July 26, 2019 11:17:39 PM
To: Feng Xue OS; [hidden email]
Cc: nd
Subject: Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

On 26/07/2019 12:58, Feng Xue OS wrote:
> This patch is composed to add a tunable 'glibc.memset.dc_zva_threshold'
> to control using DC ZVA in memset or not. Only when memset size exceeds
> this threshold, DC ZVA will be used.
>
> The background is that DC ZVA does not always outperform normal
> memory-store zeroing, especially when there are multiple processes/threads
> contending for memory/cache.

adding a threshold to memset_emag is fine, but
i'm not yet convinced that a tunable threshold
is useful enough.

is it expected that different workloads require
different setting? is this effect significant?
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Siddhesh Poyarekar-8
In reply to this post by Feng Xue OS
On 26/07/19 5:28 PM, Feng Xue OS wrote:

> This patch is composed to add a tunable 'glibc.memset.dc_zva_threshold'
> to control using DC ZVA in memset or not. Only when memset size exceeds
> this threshold, DC ZVA will be used.
>
> The background is that DC ZVA does not always outperform normal
> memory-store zeroing, especially when there are multiple processes/threads
> contending for memory/cache.
>
> Feng
> ----
>
>     * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
>     * sysdeps/aarch64/dl-tunables.list (glibc):
>     Add memset.dc_zva_threshold.
>     * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
>     dl-tunables.h
>     (INIT_ZVA_THRESHOLD): New macro, with new local variable
>     zva_threshold.
>     (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
>     * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
>     New variable.
>     (init_memset): New macro.
>     * sysdeps/aarch64/memset_base64.S (__memset_base64) [HAVE_TUNABLES]:
>     Add conditional compare over __memset_dc_zva_threshold.
>     * sysdeps/aarch64/memset_emag.S (DC_ZVA_THRESHOLD): Changed to a
>     new value.
> ---
>  ChangeLog                                 | 18 ++++++++++++++++++
>  manual/tunables.texi                      |  9 +++++++++
>  sysdeps/aarch64/dl-tunables.list          |  6 ++++++
>  sysdeps/aarch64/multiarch/init-arch.h     | 11 +++++++++++
>  sysdeps/aarch64/multiarch/memset.c        | 22 +++++++++++++++++++---
>  sysdeps/aarch64/multiarch/memset_base64.S |  7 +++++++
>  sysdeps/aarch64/multiarch/memset_emag.S   |  4 ++--
>  7 files changed, 72 insertions(+), 5 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index dbdb85d..1921e2a 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,21 @@
> +2019-07-26  Feng Xue  <[hidden email]>
> +
> + * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
> + * sysdeps/aarch64/dl-tunables.list (glibc):
> + Add memset.dc_zva_threshold.
> + * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
> + dl-tunables.h
> + (INIT_ZVA_THRESHOLD): New macro, with new local variable
> + zva_threshold.
> + (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
> + * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
> + New variable.
> + (init_memset): New macro.
> + * sysdeps/aarch64/memset_base64.S (__memset_base64) [HAVE_TUNABLES]:
> + Add conditional compare over __memset_dc_zva_threshold.
> + * sysdeps/aarch64/memset_emag.S (DC_ZVA_THRESHOLD): Changed to a
> + new value.
> +
>  2019-07-25  Florian Weimer  <[hidden email]>
>  
>   [BZ #24677]
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index ee0fdf2..c7c13cc 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -411,3 +411,12 @@ instead.
>  
>  This tunable is specific to i386 and x86-64.
>  @end deftp
> +
> +@deftp Tunable glibc.memset.dc_zva_threshold
> +The @code{glibc.memset.dc_zva_threshold} tunable allows the user to set
> +threshold to trigger DC ZVA in memset. When memset size is less than this
> +threshold, normal memory store instruction will be used, otherwise DC ZVA
> +instruction will be used. Value of zero means default threshold.
> +
> +This tunable is specific to aarch64.
> +@end deftp
> diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
> index 5fac533..0f5b5e1 100644
> --- a/sysdeps/aarch64/dl-tunables.list
> +++ b/sysdeps/aarch64/dl-tunables.list
> @@ -22,4 +22,10 @@ glibc {
>        type: STRING
>      }
>    }
> +  memset {
> +    dc_zva_threshold {
> +      type: SIZE_T
> +      default: 0
> +    }
> +  }
>  }

This should be called cache.aarch64_dc_zva_threshold or
cache.aarch64_dczva_threshold.

> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index b9020ae..93133a2 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,7 +18,18 @@
>  
>  #include <ldsodefs.h>
>  
> +#if HAVE_TUNABLES
> +# include <elf/dl-tunables.h>
> +
> +# define INIT_ZVA_THRESHOLD()      \
> +  uint64_t __attribute__((unused)) zva_threshold =      \
> +    TUNABLE_GET(glibc, memset, dc_zva_threshold, size_t, NULL);
> +#else
> +# define INIT_ZVA_THRESHOLD()
> +#endif
> +
>  #define INIT_ARCH()      \
> +  INIT_ZVA_THRESHOLD()      \
>    uint64_t __attribute__((unused)) midr =      \
>      GLRO(dl_aarch64_cpu_features).midr_el1;      \
>    unsigned __attribute__((unused)) zva_size =      \
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index 4817587..2015bce 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -32,12 +32,28 @@ extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
>  
> +# if HAVE_TUNABLES
> +uint64_t __memset_dc_zva_threshold = 512;
> +
> +#  define init_memset(fn, default_zva_threshold) \
> +({ \
> +  if (zva_threshold) \
> +    __memset_dc_zva_threshold = zva_threshold; \
> +  else if (default_zva_threshold) \
> +    __memset_dc_zva_threshold = default_zva_threshold; \
> +  fn; \
> +})
> +# else
> +#  define init_memset(fn, default_zva_threshold)  (fn)
> +# endif
> +
>  libc_ifunc (__libc_memset,
> +
>      ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
> -     ? __memset_falkor
> +     ? init_memset (__memset_falkor, 0)
>       : (IS_EMAG (midr) && zva_size == 64
> -       ? __memset_emag
> -       : __memset_generic)));
> +       ? init_memset (__memset_emag, 8*1024*1024)
> +       : init_memset (__memset_generic, 0))));

The default threshold initialization needs to take place in the emag
file and not here, this code is already getting complicated and it won't
be long until it starts looking like a character soup.  That should also
take care of the unexplained magic number (8M).

>  
>  # undef memset
>  strong_alias (__libc_memset, memset);
> diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
> index 9a62325..6350a6d 100644
> --- a/sysdeps/aarch64/multiarch/memset_base64.S
> +++ b/sysdeps/aarch64/multiarch/memset_base64.S
> @@ -91,7 +91,14 @@ L(set96):
>   .p2align 4
>  L(set_long):
>   stp val, val, [dstin]
> +#if HAVE_TUNABLES
> + adrp tmp1, __memset_dc_zva_threshold
> + add tmp1, tmp1, :lo12:__memset_dc_zva_threshold
> + ldr tmp2, [tmp1] /* Load DC ZVA tunable threshold value. */
> + cmp count, tmp2
> +#else
>   cmp count, DC_ZVA_THRESHOLD
> +#endif
>   ccmp val, 0, 0, cs
>   bic dst, dstin, 15
>   b.eq L(zva_64)
> diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
> index 1c1fabc..78a2a14 100644
> --- a/sysdeps/aarch64/multiarch/memset_emag.S
> +++ b/sysdeps/aarch64/multiarch/memset_emag.S
> @@ -23,10 +23,10 @@
>  /*
>   * Using dc zva to zero memory does not produce better performance if
>   * memory size is not very large, especially when there are multiple
> - * processes/threads contending memory/cache. Here we use a somewhat
> + * processes/threads contending memory/cache. Here we use a very
>   * large threshold to trigger usage of dc zva.
>  */
> -# define DC_ZVA_THRESHOLD 1024
> +# define DC_ZVA_THRESHOLD (8*1024*1024)
>  
>  # include "./memset_base64.S"
>  #endif
>

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Siddhesh Poyarekar-8
I should also add that my review comments do not necessarily mean that I
endorse the tunable.  Can you please post additional test results on at
least one other aarch64 PE to show that the tunable is useful there?
That might make your case for a tunable much stronger.

Siddhesh
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Feng Xue OS
In reply to this post by Siddhesh Poyarekar-8
For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.


> This should be called cache.aarch64_dc_zva_threshold or
> cache.aarch64_dczva_threshold.
I think dc_zva implies the aarch64 architecture, so the name "cache.dc_zva_threshold"
seems a little more concise.

> The default threshold initialization needs to take place in the emag
> file and not here, this code is already getting complicated and it won't
> be long until it starts looking like a character soup.  That should also
> take care of the unexplained magic number (8M).
Changed, and added comment for this default value, please refer to new patch below.

Thanks,
Feng

--------------
    * manual/tunables.texi: Document glibc.cache.dc_zva_threshold.
    * sysdeps/aarch64/dl-tunables.list (glibc):
    Add cache.dc_zva_threshold.
    * sysdeps/aarch64/multiarch/memset_emag.c: New file.
    * sysdeps/aarch64/multiarch/memset_base64.S (__memset_base64) : Add
    conditional compare over __dc_zva_threshold.
    * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Change
    to a new value.
    (HAVE_THRESHOLD_TUNABLE): New macro.
---
 ChangeLog                                 | 12 +++++++++
 manual/tunables.texi                      |  9 +++++++
 sysdeps/aarch64/dl-tunables.list          |  6 +++++
 sysdeps/aarch64/multiarch/memset.c        |  2 ++
 sysdeps/aarch64/multiarch/memset_base64.S |  7 ++++++
 sysdeps/aarch64/multiarch/memset_emag.S   | 15 +++++++----
 sysdeps/aarch64/multiarch/memset_emag.c   | 41 +++++++++++++++++++++++++++++++
 7 files changed, 87 insertions(+), 5 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset_emag.c

diff --git a/ChangeLog b/ChangeLog
index dbdb85d..7626606 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2019-07-31  Feng Xue  <[hidden email]>
+
+       * manual/tunables.texi: Document glibc.cache.dc_zva_threshold.
+       * sysdeps/aarch64/dl-tunables.list (glibc):
+       Add cache.dc_zva_threshold.
+       * sysdeps/aarch64/multiarch/memset_emag.c: New file.
+       * sysdeps/aarch64/multiarch/memset_base64.S (__memset_base64) : Add
+       conditional compare over __dc_zva_threshold.
+       * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Change
+       to a new value.
+       (HAVE_THRESHOLD_TUNABLE): New macro.
+
 2019-07-25  Florian Weimer  <[hidden email]>

        [BZ #24677]
diff --git a/manual/tunables.texi b/manual/tunables.texi
index ee0fdf2..fb304d0 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -411,3 +411,12 @@ instead.

 This tunable is specific to i386 and x86-64.
 @end deftp
+
+@deftp Tunable glibc.cache.dc_zva_threshold
+The @code{glibc.cache.dc_zva_threshold} tunable allows the user to set
+threshold to trigger DC ZVA in memset on emag processor. When memset size
+is less than this threshold, normal memory store instruction will be used,
+otherwise DC ZVA instruction will be used.
+
+This tunable is specific to emag aarch64.
+@end deftp
diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
index 5fac533..5d4bffe 100644
--- a/sysdeps/aarch64/dl-tunables.list
+++ b/sysdeps/aarch64/dl-tunables.list
@@ -22,4 +22,10 @@ glibc {
       type: STRING
     }
   }
+  cache {
+    dc_zva_threshold {
+      type: SIZE_T
+      default: 0
+    }
+  }
 }
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 4817587..792d7f6 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -41,4 +41,6 @@ libc_ifunc (__libc_memset,

 # undef memset
 strong_alias (__libc_memset, memset);
+
+# include "./memset_emag.c"
 #endif
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..748e321 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -91,7 +91,14 @@ L(set96):
        .p2align 4
 L(set_long):
        stp     val, val, [dstin]
+#ifdef HAVE_THRESHOLD_TUNABLE
+       adrp    tmp1, __dc_zva_threshold
+       add     tmp1, tmp1, :lo12:__dc_zva_threshold
+       ldr     tmp2, [tmp1]    /* Load DC ZVA tunable threshold value. */
+       cmp     count, tmp2
+#else
        cmp     count, DC_ZVA_THRESHOLD
+#endif
        ccmp    val, 0, 0, cs
        bic     dst, dstin, 15
        b.eq    L(zva_64)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..a566e40 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -20,13 +20,18 @@
 #if IS_IN (libc)
 # define MEMSET __memset_emag

+# if HAVE_TUNABLES
+#  define HAVE_THRESHOLD_TUNABLE 1
+# endif
+
 /*
- * Using dc zva to zero memory does not produce better performance if
+ * Using DC ZVA to zero memory does not produce better performance if
  * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
- * large threshold to trigger usage of dc zva.
-*/
-# define DC_ZVA_THRESHOLD 1024
+ * processes/threads contending memory/cache. Here we use a very
+ * large threshold to trigger usage of DC ZVA, which is good for
+ * multi-process/thread workloads.
+ */
+# define DC_ZVA_THRESHOLD 8*1024*1024

 # include "./memset_base64.S"
 #endif
diff --git a/sysdeps/aarch64/multiarch/memset_emag.c b/sysdeps/aarch64/multiarch/memset_emag.c
new file mode 100644
index 0000000..8b83ab9
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_emag.c
@@ -0,0 +1,41 @@
+/* Setup threshold to trigger DC ZVA in memset for emag.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if HAVE_TUNABLES
+# include <elf/dl-tunables.h>
+
+/* We assume common workloads on server are likely to consist of multiple
+ * processes/threads, contending memory/cache. For this scenario, disabling
+ * DC ZVA in memset can achieve better performance on emag processor.
+ * Therefore, by default, we use a very large threshold, here is 8M,
+ * which has similar effect as disabling DC ZVA, kind of optimization
+ * for multi-process/thread workload.
+ */
+uint64_t __dc_zva_threshold = 8 * 1024 * 1024;
+
+static void
+__attribute__ ((constructor))
+init_dc_zva_threshold (void)
+{
+  uint64_t threshold
+    = TUNABLE_GET (glibc, cache, dc_zva_threshold, uint64_t, NULL);
+
+  if (threshold)
+    __dc_zva_threshold = threshold;
+}
+#endif
--
1.8.3.1

________________________________________
From: Siddhesh Poyarekar <[hidden email]>
Sent: Monday, July 29, 2019 11:45:48 AM
To: Feng Xue OS; [hidden email]
Subject: Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

On 26/07/19 5:28 PM, Feng Xue OS wrote:

> This patch is composed to add a tunable 'glibc.memset.dc_zva_threshold'
> to control using DC ZVA in memset or not. Only when memset size exceeds
> this threshold, DC ZVA will be used.
>
> The background is that DC ZVA does not always outperform normal
> memory-store zeroing, especially when there are multiple processes/threads
> contending for memory/cache.
>
> Feng
> ----
>
>     * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
>     * sysdeps/aarch64/dl-tunables.list (glibc):
>     Add memset.dc_zva_threshold.
>     * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
>     dl-tunables.h
>     (INIT_ZVA_THRESHOLD): New macro, with new local variable
>     zva_threshold.
>     (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
>     * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
>     New variable.
>     (init_memset): New macro.
>     * sysdeps/aarch64/memset_base64.S (__memset_base64) [HAVE_TUNABLES]:
>     Add conditional compare over __memset_dc_zva_threshold.
>     * sysdeps/aarch64/memset_emag.S (DC_ZVA_THRESHOLD): Changed to a
>     new value.
> ---
>  ChangeLog                                 | 18 ++++++++++++++++++
>  manual/tunables.texi                      |  9 +++++++++
>  sysdeps/aarch64/dl-tunables.list          |  6 ++++++
>  sysdeps/aarch64/multiarch/init-arch.h     | 11 +++++++++++
>  sysdeps/aarch64/multiarch/memset.c        | 22 +++++++++++++++++++---
>  sysdeps/aarch64/multiarch/memset_base64.S |  7 +++++++
>  sysdeps/aarch64/multiarch/memset_emag.S   |  4 ++--
>  7 files changed, 72 insertions(+), 5 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index dbdb85d..1921e2a 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,21 @@
> +2019-07-26  Feng Xue  <[hidden email]>
> +
> +     * manual/tunables.texi: Document glibc.memset.dc_zva_threshold.
> +     * sysdeps/aarch64/dl-tunables.list (glibc):
> +     Add memset.dc_zva_threshold.
> +     * sysdeps/aarch64/multiarch/init-arch.h [HAVE_TUNABLES]: Include
> +     dl-tunables.h
> +     (INIT_ZVA_THRESHOLD): New macro, with new local variable
> +     zva_threshold.
> +     (INIT_ARCH): Add INIT_ZVA_THRESHOLD.
> +     * sysdeps/aarch64/multiarch/memset.c (__memset_dc_zva_threshold):
> +     New variable.
> +     (init_memset): New macro.
> +     * sysdeps/aarch64/memset_base64.S (__memset_base64) [HAVE_TUNABLES]:
> +     Add conditional compare over __memset_dc_zva_threshold.
> +     * sysdeps/aarch64/memset_emag.S (DC_ZVA_THRESHOLD): Changed to a
> +     new value.
> +
>  2019-07-25  Florian Weimer  <[hidden email]>
>
>       [BZ #24677]
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index ee0fdf2..c7c13cc 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -411,3 +411,12 @@ instead.
>
>  This tunable is specific to i386 and x86-64.
>  @end deftp
> +
> +@deftp Tunable glibc.memset.dc_zva_threshold
> +The @code{glibc.memset.dc_zva_threshold} tunable allows the user to set
> +threshold to trigger DC ZVA in memset. When memset size is less than this
> +threshold, normal memory store instruction will be used, otherwise DC ZVA
> +instruction will be used. Value of zero means default threshold.
> +
> +This tunable is specific to aarch64.
> +@end deftp
> diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
> index 5fac533..0f5b5e1 100644
> --- a/sysdeps/aarch64/dl-tunables.list
> +++ b/sysdeps/aarch64/dl-tunables.list
> @@ -22,4 +22,10 @@ glibc {
>        type: STRING
>      }
>    }
> +  memset {
> +    dc_zva_threshold {
> +      type: SIZE_T
> +      default: 0
> +    }
> +  }
>  }

This should be called cache.aarch64_dc_zva_threshold or
cache.aarch64_dczva_threshold.

> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index b9020ae..93133a2 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,7 +18,18 @@
>
>  #include <ldsodefs.h>
>
> +#if HAVE_TUNABLES
> +# include <elf/dl-tunables.h>
> +
> +# define INIT_ZVA_THRESHOLD()                                                      \
> +  uint64_t __attribute__((unused)) zva_threshold =                         \
> +    TUNABLE_GET(glibc, memset, dc_zva_threshold, size_t, NULL);
> +#else
> +# define INIT_ZVA_THRESHOLD()
> +#endif
> +
>  #define INIT_ARCH()                                                        \
> +  INIT_ZVA_THRESHOLD()                                                             \
>    uint64_t __attribute__((unused)) midr =                                  \
>      GLRO(dl_aarch64_cpu_features).midr_el1;                                \
>    unsigned __attribute__((unused)) zva_size =                                      \
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index 4817587..2015bce 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -32,12 +32,28 @@ extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
>
> +# if HAVE_TUNABLES
> +uint64_t __memset_dc_zva_threshold = 512;
> +
> +#  define init_memset(fn, default_zva_threshold)             \
> +({                                                           \
> +  if (zva_threshold)                                         \
> +    __memset_dc_zva_threshold = zva_threshold;                       \
> +  else if (default_zva_threshold)                            \
> +    __memset_dc_zva_threshold = default_zva_threshold;               \
> +  fn;                                                                \
> +})
> +# else
> +#  define init_memset(fn, default_zva_threshold)  (fn)
> +# endif
> +
>  libc_ifunc (__libc_memset,
> +
>           ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
> -          ? __memset_falkor
> +          ? init_memset (__memset_falkor, 0)
>            : (IS_EMAG (midr) && zva_size == 64
> -            ? __memset_emag
> -            : __memset_generic)));
> +            ? init_memset (__memset_emag, 8*1024*1024)
> +            : init_memset (__memset_generic, 0))));

The default threshold initialization needs to take place in the emag
file and not here, this code is already getting complicated and it won't
be long until it starts looking like a character soup.  That should also
take care of the unexplained magic number (8M).

>
>  # undef memset
>  strong_alias (__libc_memset, memset);
> diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
> index 9a62325..6350a6d 100644
> --- a/sysdeps/aarch64/multiarch/memset_base64.S
> +++ b/sysdeps/aarch64/multiarch/memset_base64.S
> @@ -91,7 +91,14 @@ L(set96):
>       .p2align 4
>  L(set_long):
>       stp     val, val, [dstin]
> +#if HAVE_TUNABLES
> +     adrp    tmp1, __memset_dc_zva_threshold
> +     add     tmp1, tmp1, :lo12:__memset_dc_zva_threshold
> +     ldr     tmp2, [tmp1]    /* Load DC ZVA tunable threshold value. */
> +     cmp     count, tmp2
> +#else
>       cmp     count, DC_ZVA_THRESHOLD
> +#endif
>       ccmp    val, 0, 0, cs
>       bic     dst, dstin, 15
>       b.eq    L(zva_64)
> diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
> index 1c1fabc..78a2a14 100644
> --- a/sysdeps/aarch64/multiarch/memset_emag.S
> +++ b/sysdeps/aarch64/multiarch/memset_emag.S
> @@ -23,10 +23,10 @@
>  /*
>   * Using dc zva to zero memory does not produce better performance if
>   * memory size is not very large, especially when there are multiple
> - * processes/threads contending memory/cache. Here we use a somewhat
> + * processes/threads contending memory/cache. Here we use a very
>   * large threshold to trigger usage of dc zva.
>  */
> -# define DC_ZVA_THRESHOLD 1024
> +# define DC_ZVA_THRESHOLD (8*1024*1024)
>
>  # include "./memset_base64.S"
>  #endif
>

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Siddhesh Poyarekar-8
On 02/08/19 7:19 AM, Feng Xue OS wrote:
> For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
> improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.

That's great, can you test for another part too?  Making a case for a
tunable is easier if you can show applicability to a larger set of
processors.

>> This should be called cache.aarch64_dc_zva_threshold or
>> cache.aarch64_dczva_threshold.
> I think dc_zva implies aarch64 architecture, so the name "cache.dc_zva_threshold"
> seems to be concise a little bit.

It's not just about whether the meaning is clear, it is about naming
convention.  Not having an architecture name in the tunable implies that
it could be generally applicable.

Siddhesh
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Siddhesh Poyarekar-8
On 02/08/19 8:37 AM, Siddhesh Poyarekar wrote:
> On 02/08/19 7:19 AM, Feng Xue OS wrote:
>> For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
>> improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.
>
> That's great, can you test for another part too?  Making a case for a
> tunable is easier if you can show applicability to a larger set of
> processors.

Oops, I assumed this explanation was for introducing the tunable, I
realize now that it is for the default 8M value on emag.  The test is
still desirable to make the case for a tunable stronger, but not for
this specific point :)

Siddhesh

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Feng Xue OS
>> That's great, can you test for another part too?  Making a case for a
>> tunable is easier if you can show applicability to a larger set of
>> processors.

> Oops, I assumed this explanation was for introducing the tunable, I
> realize now that it is for the default 8M value on emag.  The test is
> still desirable to make the case for a tunable stronger, but not for
> this specific point :)

I still hope this tuning of DC ZVA can work for other aarch64 processors.
Since we focus on emag and have no other aarch64 machines on hand,
it would be better if someone with other aarch64 hardware were willing to test this.

> It's not just about whether the meaning is clear, it is about naming
> convention.  Not having an architecture name in the tunable implies that
> it could be generally applicable.
Ok. Changed.

Thanks,
Feng

--------
    * manual/tunables.texi: Document glibc.cache.aarch64_dczva_threshold.
    * sysdeps/aarch64/dl-tunables.list (glibc):
    Add cache.aarch64_dczva_threshold.
    * sysdeps/aarch64/multiarch/memset_emag.c: New file.
    * sysdeps/aarch64/multiarch/memset_base64.S (DC_ZVA_THRESHOLD) :
    Rename to DCZVA_THRESHOLD.
    * (__memset_base64) : Add conditional compare over __dczva_threshold
    enclosed by new macro HAVE_DCZVA_THRESHOLD_TUNABLE.
    * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Rename
    to DCZVA_THRESHOLD, and change to a new value.
    (HAVE_DCZVA_THRESHOLD_TUNABLE): New macro.
---
 ChangeLog                                 | 14 +++++++++++
 manual/tunables.texi                      |  9 +++++++
 sysdeps/aarch64/dl-tunables.list          |  6 +++++
 sysdeps/aarch64/multiarch/memset.c        |  2 ++
 sysdeps/aarch64/multiarch/memset_base64.S | 13 +++++++---
 sysdeps/aarch64/multiarch/memset_emag.S   | 15 +++++++----
 sysdeps/aarch64/multiarch/memset_emag.c   | 41 +++++++++++++++++++++++++++++++
 7 files changed, 92 insertions(+), 8 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset_emag.c

diff --git a/ChangeLog b/ChangeLog
index dbdb85d..94c55e3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2019-07-31  Feng Xue  <[hidden email]>
+
+       * manual/tunables.texi: Document glibc.cache.aarch64_dczva_threshold.
+       * sysdeps/aarch64/dl-tunables.list (glibc):
+       Add cache.aarch64_dczva_threshold.
+       * sysdeps/aarch64/multiarch/memset_emag.c: New file.
+       * sysdeps/aarch64/multiarch/memset_base64.S (DC_ZVA_THRESHOLD) :
+       Rename to DCZVA_THRESHOLD.
+       * (__memset_base64) : Add conditional compare over __dczva_threshold
+       enclosed by new macro HAVE_DCZVA_THRESHOLD_TUNABLE.
+       * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Rename
+       to DCZVA_THRESHOLD, and change to a new value.
+       (HAVE_DCZVA_THRESHOLD_TUNABLE): New macro.
+
 2019-07-25  Florian Weimer  <[hidden email]>

        [BZ #24677]
diff --git a/manual/tunables.texi b/manual/tunables.texi
index ee0fdf2..b248739 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -411,3 +411,12 @@ instead.

 This tunable is specific to i386 and x86-64.
 @end deftp
+
+@deftp Tunable glibc.cache.aarch64_dczva_threshold
+The @code{glibc.cache.aarch64_dczva_threshold} tunable allows the user to set
+the threshold that triggers DC ZVA in memset on emag processors.  When the
+memset size is less than this threshold, normal memory store instructions
+are used; otherwise the DC ZVA instruction is used.
+
+This tunable is specific to emag aarch64.
+@end deftp
diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
index 5fac533..3bc622c 100644
--- a/sysdeps/aarch64/dl-tunables.list
+++ b/sysdeps/aarch64/dl-tunables.list
@@ -22,4 +22,10 @@ glibc {
       type: STRING
     }
   }
+  cache {
+    aarch64_dczva_threshold {
+      type: SIZE_T
+      default: 0
+    }
+  }
 }
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 4817587..792d7f6 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -41,4 +41,6 @@ libc_ifunc (__libc_memset,

 # undef memset
 strong_alias (__libc_memset, memset);
+
+# include "./memset_emag.c"
 #endif
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..d523320 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -23,8 +23,8 @@
 # define MEMSET __memset_base64
 #endif

-#ifndef DC_ZVA_THRESHOLD
-# define DC_ZVA_THRESHOLD 512
+#ifndef DCZVA_THRESHOLD
+# define DCZVA_THRESHOLD 512
 #endif

 /* Assumptions:
@@ -91,7 +91,14 @@ L(set96):
        .p2align 4
 L(set_long):
        stp     val, val, [dstin]
-       cmp     count, DC_ZVA_THRESHOLD
+#ifdef HAVE_DCZVA_THRESHOLD_TUNABLE
+       adrp    tmp1, __dczva_threshold
+       add     tmp1, tmp1, :lo12:__dczva_threshold
+       ldr     tmp2, [tmp1]    /* Load DC ZVA tunable threshold value. */
+       cmp     count, tmp2
+#else
+       cmp     count, DCZVA_THRESHOLD
+#endif
        ccmp    val, 0, 0, cs
        bic     dst, dstin, 15
        b.eq    L(zva_64)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..6df612b 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -20,13 +20,18 @@
 #if IS_IN (libc)
 # define MEMSET __memset_emag

+# if HAVE_TUNABLES
+#  define HAVE_DCZVA_THRESHOLD_TUNABLE 1
+# endif
+
 /*
- * Using dc zva to zero memory does not produce better performance if
+ * Using DC ZVA to zero memory does not produce better performance if
  * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
- * large threshold to trigger usage of dc zva.
-*/
-# define DC_ZVA_THRESHOLD 1024
+ * processes/threads contending memory/cache. Here we use a very
+ * large threshold to trigger usage of DC ZVA, which is good for
+ * multi-process/thread workloads.
+ */
+# define DCZVA_THRESHOLD 8*1024*1024

 # include "./memset_base64.S"
 #endif
diff --git a/sysdeps/aarch64/multiarch/memset_emag.c b/sysdeps/aarch64/multiarch/memset_emag.c
new file mode 100644
index 0000000..6121046
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_emag.c
@@ -0,0 +1,41 @@
+/* Setup threshold to trigger DC ZVA in memset for emag.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if HAVE_TUNABLES
+# include <elf/dl-tunables.h>
+
+/* We assume common server workloads are likely to consist of multiple
+ * processes/threads contending for memory/cache.  For this scenario,
+ * disabling DC ZVA in memset can achieve better performance on the emag
+ * processor.  Therefore, by default, we use a very large threshold (8M
+ * here), which has a similar effect to disabling DC ZVA; this is a kind
+ * of optimization for multi-process/thread workloads.
+ */
+uint64_t __dczva_threshold = 8 * 1024 * 1024;
+
+static void
+__attribute__ ((constructor))
+init_dczva_threshold (void)
+{
+  uint64_t threshold
+    = TUNABLE_GET (glibc, cache, aarch64_dczva_threshold, uint64_t, NULL);
+
+  if (threshold)
+    __dczva_threshold = threshold;
+}
+#endif
--
1.8.3.1

________________________________________
From: Siddhesh Poyarekar <[hidden email]>
Sent: Friday, August 2, 2019 11:10:24 AM
To: Feng Xue OS; [hidden email]
Subject: Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

On 02/08/19 8:37 AM, Siddhesh Poyarekar wrote:
> On 02/08/19 7:19 AM, Feng Xue OS wrote:
>> For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
>> improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.
>
> That's great, can you test for another part too?  Making a case for a
> tunable is easier if you can show applicability to a larger set of
> processors.

Oops, I assumed this explanation was for introducing the tunable, I
realize now that it is for the default 8M value on emag.  The test is
still desirable to make the case for a tunable stronger, but not for
this specific point :)

Siddhesh

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Wilco Dijkstra-2
In reply to this post by Feng Xue OS
Hi Feng,

> I still hope this tuning on dc zva can work for other aarch64 processors.
> Since we focus on emag, and got no other aarch64 machines on hand,
> Then, if someone of other aarch64 is willing to test this, that would be better.

I don't believe this kind of tunable is useful in general. DC ZVA exists because
it gives a speedup - quite significantly so on the latest microarchitectures, but it
improves gcc_r performance as well on older cores like Cortex-A57.

If you find that it doesn't help emag, the best option is to avoid DC ZVA
altogether - this is even faster as you don't have to execute the runtime check.
Or you could use a tunable to select between fixed settings of the DC ZVA.

In fact it might be useful to have a generic tunable which allows one to choose
specific ifuncs, eg. glibc.memset=__memset_no_dczva.

        .p2align 4
 L(set_long):
        stp     val, val, [dstin]
-       cmp     count, DC_ZVA_THRESHOLD
+#ifdef HAVE_DCZVA_THRESHOLD_TUNABLE
+       adrp    tmp1, __dczva_threshold
+       add     tmp1, tmp1, :lo12:__dczva_threshold
+       ldr     tmp2, [tmp1]    /* Load DC ZVA tunable threshold value. */
+       cmp     count, tmp2
+#else
+       cmp     count, DCZVA_THRESHOLD
+#endif

I don't think it makes sense to support both options here. The existing code
is carefully laid out so this undoes the 16-byte alignment of the following loops.

Wilco
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Siddhesh Poyarekar-8
On 06/08/19 9:47 PM, Wilco Dijkstra wrote:

> Hi Feng,
>
>> I still hope this tuning on dc zva can work for other aarch64 processors.
>> Since we focus on emag, and got no other aarch64 machines on hand,
>> Then, if someone of other aarch64 is willing to test this, that would be better.
>
> I don't believe this kind of tunable is useful in general. DC ZVA exists because
> it gives a speedup - quite significantly so on the latest microarchitectures, but it
> improves gcc_r performance as well on older cores like Cortex-A57.
>
> If you find that it doesn't help emag, the best option is to avoid DC ZVA
> altogether - this is even faster as you don't have to execute the runtime check.
> Or you could use a tunable to select between fixed settings of the DC ZVA.
>
> In fact it might be useful to have a generic tunable which allows one to choose
> specific ifuncs, eg. glibc.memset=__memset_no_dczva.

This is an interesting idea.  Although just for this specific case, it
might be sufficient to implement the glibc.cpu.hwcaps tunable from x86
and have "dczva" as a capability that can be turned on or off with + or -.

But your first suggestion is probably the easiest; drop dc zva
completely for ampere.

Siddhesh
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Feng Xue OS
This version disables DC ZVA on emag.

Feng
------
    * sysdeps/aarch64/multiarch/memset_base64.S (DC_ZVA_THRESHOLD):
    Disable DC ZVA code if this macro is defined as zero.
    * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD):
    Change to zero to disable using DC ZVA.
---
 ChangeLog                                 |  7 +++++++
 sysdeps/aarch64/multiarch/memset_base64.S | 12 ++++++++++--
 sysdeps/aarch64/multiarch/memset_emag.S   | 12 +++++++-----
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index dbdb85d..ba27f96 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2019-08-08  Feng Xue  <[hidden email]>
+
+       * sysdeps/aarch64/multiarch/memset_base64.S (DC_ZVA_THRESHOLD):
+       Disable DC ZVA code if this macro is defined as zero.
+       * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD):
+       Change to zero to disable using DC ZVA.
+
 2019-07-25  Florian Weimer  <[hidden email]>

        [BZ #24677]
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..c0cccba 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -23,6 +23,7 @@
 # define MEMSET __memset_base64
 #endif

+/* To disable DC ZVA, set this threshold to 0. */
 #ifndef DC_ZVA_THRESHOLD
 # define DC_ZVA_THRESHOLD 512
 #endif
@@ -91,11 +92,12 @@ L(set96):
        .p2align 4
 L(set_long):
        stp     val, val, [dstin]
+       bic     dst, dstin, 15
+#if DC_ZVA_THRESHOLD
        cmp     count, DC_ZVA_THRESHOLD
        ccmp    val, 0, 0, cs
-       bic     dst, dstin, 15
        b.eq    L(zva_64)
-
+#endif
        /* Small-size or non-zero memset does not use DC ZVA. */
        sub     count, dstend, dst

@@ -105,7 +107,11 @@ L(set_long):
         * count is less than 33 bytes, so as to bypass 2 unneccesary stps.
         */
        sub     count, count, 64+16+1
+
+#if DC_ZVA_THRESHOLD
+       /* Align loop on 16-byte boundary, this might be friendly to i-cache. */
        nop
+#endif

 1:     stp     val, val, [dst, 16]
        stp     val, val, [dst, 32]
@@ -121,6 +127,7 @@ L(set_long):
        stp     val, val, [dstend, -16]
        ret

+#if DC_ZVA_THRESHOLD
        .p2align 3
 L(zva_64):
        stp     val, val, [dst, 16]
@@ -173,6 +180,7 @@ L(zva_64):
 1:     stp     val, val, [dstend, -32]
        stp     val, val, [dstend, -16]
        ret
+#endif

 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..c2aed62 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -21,12 +21,14 @@
 # define MEMSET __memset_emag

 /*
- * Using dc zva to zero memory does not produce better performance if
+ * Using DC ZVA to zero memory does not produce better performance if
  * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
- * large threshold to trigger usage of dc zva.
-*/
-# define DC_ZVA_THRESHOLD 1024
+ * processes/threads contending memory/cache. Here we set threshold to
+ * zero to disable using DC ZVA, which is good for multi-process/thread
+ * workloads.
+ */
+
+# define DC_ZVA_THRESHOLD 0

 # include "./memset_base64.S"
 #endif
--
1.8.3.1

________________________________________
From: Siddhesh Poyarekar <[hidden email]>
Sent: Wednesday, August 7, 2019 10:12:48 PM
To: Wilco Dijkstra; 'GNU C Library'; Feng Xue OS
Cc: nd
Subject: Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

On 06/08/19 9:47 PM, Wilco Dijkstra wrote:

> Hi Feng,
>
>> I still hope this tuning on dc zva can work for other aarch64 processors.
>> Since we focus on emag, and got no other aarch64 machines on hand,
>> Then, if someone of other aarch64 is willing to test this, that would be better.
>
> I don't believe this kind of tunable is useful in general. DC ZVA exists because
> it gives a speedup - quite significantly so on the latest microarchitectures, but it
> improves gcc_r performance as well on older cores like Cortex-A57.
>
> If you find that it doesn't help emag, the best option is to avoid DC ZVA
> altogether - this is even faster as you don't have to execute the runtime check.
> Or you could use a tunable to select between fixed settings of the DC ZVA.
>
> In fact it might be useful to have a generic tunable which allows one to choose
> specific ifuncs, eg. glibc.memset=__memset_no_dczva.

This is an interesting idea.  Although just for this specific case, it
might be sufficient to implement the glibc.cpu.hwcaps tunable from x86
and have "dczva" as a capability that can be turned on or off with + or -.

But your first suggestion is probably the easiest; drop dc zva
completely for ampere.

Siddhesh
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold

Wilco Dijkstra-2
Hi Feng,

> This version disable DC ZVA in emag.

That looks good to me.

diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..c0cccba 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -23,6 +23,7 @@
 # define MEMSET __memset_base64
 #endif

+/* To disable DC ZVA, set this threshold to 0. */
 #ifndef DC_ZVA_THRESHOLD
 # define DC_ZVA_THRESHOLD 512
 #endif
@@ -91,11 +92,12 @@ L(set96):
        .p2align 4
 L(set_long):
        stp     val, val, [dstin]
+       bic     dst, dstin, 15
+#if DC_ZVA_THRESHOLD
        cmp     count, DC_ZVA_THRESHOLD
        ccmp    val, 0, 0, cs
-       bic     dst, dstin, 15
        b.eq    L(zva_64)
-
+#endif
        /* Small-size or non-zero memset does not use DC ZVA. */
        sub     count, dstend, dst

@@ -105,7 +107,11 @@ L(set_long):
         * count is less than 33 bytes, so as to bypass 2 unneccesary stps.
         */
        sub     count, count, 64+16+1
+
+#if DC_ZVA_THRESHOLD
+       /* Align loop on 16-byte boundary, this might be friendly to i-cache. */
        nop
+#endif

 1:     stp     val, val, [dst, 16]
        stp     val, val, [dst, 32]
@@ -121,6 +127,7 @@ L(set_long):
        stp     val, val, [dstend, -16]
        ret

+#if DC_ZVA_THRESHOLD
        .p2align 3
 L(zva_64):
        stp     val, val, [dst, 16]
@@ -173,6 +180,7 @@ L(zva_64):
 1:     stp     val, val, [dstend, -32]
        stp     val, val, [dstend, -16]
        ret
+#endif

 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..c2aed62 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -21,12 +21,14 @@
 # define MEMSET __memset_emag

 /*
- * Using dc zva to zero memory does not produce better performance if
+ * Using DC ZVA to zero memory does not produce better performance if
  * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
- * large threshold to trigger usage of dc zva.
-*/
-# define DC_ZVA_THRESHOLD 1024
+ * processes/threads contending memory/cache. Here we set threshold to
+ * zero to disable using DC ZVA, which is good for multi-process/thread
+ * workloads.
+ */
+
+# define DC_ZVA_THRESHOLD 0

 # include "./memset_base64.S"
 #endif

OK

Wilco