[PATCH][AArch64] Cleanup memset

classic Classic list List threaded Threaded
8 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH][AArch64] Cleanup memset

Wilco Dijkstra-2
Cleanup memset.  Remove unnecessary code for unused ZVA sizes.
For zero memsets it's faster use DC ZVA for >= 160 bytes rather
than 256. Add a define which allows skipping the ZVA size test when
the ZVA size is known to be 64 - reading dczid_el0 may be expensive.
This simplifies the Falkor memset implementation.

--
diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
index 872a6b511de2bb97939e644c78676a6212c65d6d..22da7381e68bdfaa6086e39cc980c77150242d21 100644
--- a/sysdeps/aarch64/memset-reg.h
+++ b/sysdeps/aarch64/memset-reg.h
@@ -22,9 +22,4 @@
 #define count x2
 #define dst x3
 #define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
+#define zva_val x5
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index ac577f1660e2706e2024e42d0efc09120c041af2..4654732e1bdd4a7e8f673b79ed66be408bed726b 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -78,114 +78,49 @@ L(set96):
  stp q0, q0, [dstend, -32]
  ret
 
- .p2align 3
- nop
+ .p2align 4
 L(set_long):
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large.  */
- sub dst, dst, 16 /* Dst is biased by -32.  */
- sub count, count, 64 + 16 /* Adjust count and bias for loop.  */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
-L(try_zva):
-#ifdef ZVA_MACRO
- zva_macro
-#else
- .p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes.  */
- b.ne L(zva_128)
-
- /* Write the first and last 64 byte aligned block using stp rather
-   than using DC ZVA.  This is faster on some cores.
- */
-L(zva_64):
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes.  */
+ b.ne L(no_zva)
+#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
  bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop.  */
- add dst, dst, 128
- nop
-1: dc zva, dst
+ sub count, dstend, dst /* Count is now 64 too large.  */
+ sub count, count, 128 /* Adjust count and bias for loop.  */
+
+ .p2align 4
+L(zva_loop):
  add dst, dst, 64
+ dc zva, dst
  subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(zva_loop)
  stp q0, q0, [dstend, -64]
  stp q0, q0, [dstend, -32]
  ret
 
- .p2align 3
-L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes.  */
- b.ne L(zva_other)
-
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large.  */
+ sub dst, dst, 16 /* Dst is biased by -32.  */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop.  */
+L(no_zva_loop):
  stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop.  */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
  stp q0, q0, [dstend, -64]
  stp q0, q0, [dstend, -32]
  ret
 
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write.  */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address.  */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write.  */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop.  */
- b L(tail64)
-#endif
-
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
index 54fd5abffb1b6638ef8a5fc29e58b2f67765b28a..bee4aed52aab69f5aaded367d40e8b64e406c545 100644
--- a/sysdeps/aarch64/multiarch/memset_falkor.S
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -24,30 +24,8 @@
    use this function only when ZVA is enabled.  */
 
 #if IS_IN (libc)
-.macro zva_macro
- .p2align 4
- /* Write the first and last 64 byte aligned block using stp rather
-   than using DC ZVA.  This is faster on some cores.  */
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop.  */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 64
- subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-.endm
-
-# define ZVA_MACRO zva_macro
+
+# define SKIP_ZVA_CHECK
 # define MEMSET __memset_falkor
 # include <sysdeps/aarch64/memset.S>
 #endif


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Siddhesh Poyarekar-8
On 26/02/20 21:41, Wilco Dijkstra wrote:
> Cleanup memset.  Remove unnecessary code for unused ZVA sizes.
> For zero memsets it's faster use DC ZVA for >= 160 bytes rather
> than 256. Add a define which allows skipping the ZVA size test when
> the ZVA size is known to be 64 - reading dczid_el0 may be expensive.
> This simplifies the Falkor memset implementation.

This looks OK in general, although can you please elaborate on the
following:

- What cores did you test on to conclude that 160 is a better threshold
than 256?

- Is the intention to support non-64-byte zva sizes once there is actual
hardware that implements it and not bother with it for now?  I agree
with the idea if that's the case, just that it would be nice to have
that documented in the git commit message.

Siddhesh
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Wilco Dijkstra-2
Hi Siddhesh,

> This looks OK in general, although can you please elaborate on the
> following:
>
> - What cores did you test on to conclude that 160 is a better threshold
> than 256?

I've mostly done testing on Neoverse N1, Cortex-A72 and Cortex-A53.
(the latter seems to be always faster with DC ZVA disabled, so the threshold
doesn't really matter). I wrote a random memset benchmark similar to the
memcpy one, and performance is unchanged there given there is no change
in the way small cases are handled.

> - Is the intention to support non-64-byte zva sizes once there is actual
> hardware that implements it and not bother with it for now?  I agree
> with the idea if that's the case, just that it would be nice to have
> that documented in the git commit message.

Yes, otherwise it's hard to test or prove it helps performance after all. We've
had issues with the non-64 ZVA sizes before, so it's best to keep it simple.

I'm also trying to reduce the amount of code and avoid unnecessary proliferation
of almost identical ifuncs. I think we can remove most of the memset ifuncs,
it seems we need one version without ZVA and a ZVA version for size 64.

Cheers,
Wilco
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Siddhesh Poyarekar-8
On 03/03/20 20:12, Wilco Dijkstra wrote:

> Hi Siddhesh,
>
>> This looks OK in general, although can you please elaborate on the
>> following:
>>
>> - What cores did you test on to conclude that 160 is a better threshold
>> than 256?
>
> I've mostly done testing on Neoverse N1, Cortex-A72 and Cortex-A53.
> (the latter seems to be always faster with DC ZVA disabled, so the threshold
> doesn't really matter). I wrote a random memset benchmark similar to the
> memcpy one, and performance is unchanged there given there is no change
> in the way small cases are handled.

OK, please mention this in the commit log.

>> - Is the intention to support non-64-byte zva sizes once there is actual
>> hardware that implements it and not bother with it for now?  I agree
>> with the idea if that's the case, just that it would be nice to have
>> that documented in the git commit message.
>
> Yes, otherwise it's hard to test or prove it helps performance after all. We've
> had issues with the non-64 ZVA sizes before, so it's best to keep it simple.
>
> I'm also trying to reduce the amount of code and avoid unnecessary proliferation
> of almost identical ifuncs. I think we can remove most of the memset ifuncs,
> it seems we need one version without ZVA and a ZVA version for size 64.

Sounds like a plan.

Thanks,
Siddhesh
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Andrew Pinski-3
In reply to this post by Wilco Dijkstra-2
On Tue, Mar 3, 2020 at 6:42 AM Wilco Dijkstra <[hidden email]> wrote:

>
> Hi Siddhesh,
>
> > This looks OK in general, although can you please elaborate on the
> > following:
> >
> > - What cores did you test on to conclude that 160 is a better threshold
> > than 256?
>
> I've mostly done testing on Neoverse N1, Cortex-A72 and Cortex-A53.
> (the latter seems to be always faster with DC ZVA disabled, so the threshold
> doesn't really matter). I wrote a random memset benchmark similar to the
> memcpy one, and performance is unchanged there given there is no change
> in the way small cases are handled.
>
> > - Is the intention to support non-64-byte zva sizes once there is actual
> > hardware that implements it and not bother with it for now?  I agree
> > with the idea if that's the case, just that it would be nice to have
> > that documented in the git commit message.
>
> Yes, otherwise it's hard to test or prove it helps performance after all. We've
> had issues with the non-64 ZVA sizes before, so it's best to keep it simple.
>
> I'm also trying to reduce the amount of code and avoid unnecessary proliferation
> of almost identical ifuncs. I think we can remove most of the memset ifuncs,
> it seems we need one version without ZVA and a ZVA version for size 64.

I am trying to understand what was the decision here.  The main reason
is OcteonTX2 does ZVA 128 which was faster than doing one without
(OcteonTX1 is similar but has an errata which causes ZVA to be turned
off).

Thanks,
Andrew Pinski

>
> Cheers,
> Wilco
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Wilco Dijkstra-2
Hi Andrew,

>> Yes, otherwise it's hard to test or prove it helps performance after all. We've
>> had issues with the non-64 ZVA sizes before, so it's best to keep it simple.
>>
>> I'm also trying to reduce the amount of code and avoid unnecessary proliferation
>> of almost identical ifuncs. I think we can remove most of the memset ifuncs,
>> it seems we need one version without ZVA and a ZVA version for size 64.
>
> I am trying to understand what was the decision here.  The main reason
> is OcteonTX2 does ZVA 128 which was faster than doing one without
> (OcteonTX1 is similar but has an errata which causes ZVA to be turned
> off).

So you're saying there may soon be a new microarchitecture which supports
ZVA size 128? Is there a significant speedup? Is querying the ZVA size fast?

Cheers,
Wilco

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Sourceware - libc-alpha mailing list
On Thu, Mar 12, 2020 at 8:46 AM Wilco Dijkstra <[hidden email]> wrote:

>
> Hi Andrew,
>
> >> Yes, otherwise it's hard to test or prove it helps performance after all. We've
> >> had issues with the non-64 ZVA sizes before, so it's best to keep it simple.
> >>
> >> I'm also trying to reduce the amount of code and avoid unnecessary proliferation
> >> of almost identical ifuncs. I think we can remove most of the memset ifuncs,
> >> it seems we need one version without ZVA and a ZVA version for size 64.
> >
> > I am trying to understand what was the decision here.  The main reason
> > is OcteonTX2 does ZVA 128 which was faster than doing one without
> > (OcteonTX1 is similar but has an errata which causes ZVA to be turned
> > off).
>
> So you're saying there may soon be a new microarchitecture which supports
> ZVA size 128? Is there a significant speedup? Is querying the ZVA size fast?

By new you mean just publicly announced within a month that supports
ZVA size of 128; OcteonTX 2 annoucment:
https://www.marvell.com/company/newsroom/marvell-announces-octeon-tx2-family-of-multi-core-infrastructure-processors.html
?
Yes there is significant speedup for using "DC ZVA" as the latency is
one cycle compared to the multiple (8) cycles needed to fill the full
128byte cache line using stp.
Yes querying ZVA size fast, only a 1 cycles latency.

Thanks,
Andrew


>
> Cheers,
> Wilco
>
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH][AArch64] Cleanup memset

Siddhesh Poyarekar-8
On 12/03/20 22:36, Andrew Pinski wrote:
> By new you mean just publicly announced within a month that supports
> ZVA size of 128; OcteonTX 2 annoucment:
> https://www.marvell.com/company/newsroom/marvell-announces-octeon-tx2-family-of-multi-core-infrastructure-processors.html
> ?
> Yes there is significant speedup for using "DC ZVA" as the latency is
> one cycle compared to the multiple (8) cycles needed to fill the full
> 128byte cache line using stp.
> Yes querying ZVA size fast, only a 1 cycles latency.

Wouldn't a zva128 be an improvement over zva_other?  Perhaps even move
towards per-zva size ifuncs that gets rid of the need to do a size check
inside memset.  That way, memset_falkor simply becomes memset_zva64.
IIRC the reason why that suggestion was resisted (that was my initial
approach for the falkor memset) was that it didn't give any significant
improvement on other architectures.  That's probably not a strong enough
reason now that I think of it, given that there will be a reasonable
limit to the number of ifuncs that get spawned off here.

IIRC Richard had proposed a similar patch for multiple zva sizes some
years ago too, not sure what happened to that series.

Siddhesh