[PATCH v2 0/3] x86: Add support for Zhaoxin processors

classic Classic list List threaded Threaded
15 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH v2 0/3] x86: Add support for Zhaoxin processors

MayShao
This patch series fixes the Shanghai Zhaoxin processor CPU Vendor ID detection
problem in the glibc sysdep module. Current glibc doesn't recognize the Zhaoxin
CPU Vendor IDs ("CentaurHauls" and "Shanghai") and sets kind to
arch_kind_other. This leads to incorrect results from __cache_sysconf(),
incorrect values for variables like __x86_shared_cache_size, and failure
of the test case tst-get-cpu-features.

Previous version: https://sourceware.org/pipermail/libc-alpha/2019-December/109170.html
More discussion: https://sourceware.org/pipermail/libc-alpha/2019-December/109227.html

Changes from previous version:
    - Remove the bit_arch_Prefer_MAP_32BIT_EXEC flag on the Zhaoxin processor
      with family==0x6.

This series was tested on x86_64-linux-gnu.

MayShao (3):
  x86: Add CPU Vendor ID detection support for Zhaoxin processors
  x86: Add cache information support for Zhaoxin processors
  x86: Add the test case of __get_cpu_features support for Zhaoxin
    processors

 sysdeps/x86/cacheinfo.c            | 185 +++++++++++++++++++++++++++++++++++++
 sysdeps/x86/cpu-features.c         |  58 ++++++++++++
 sysdeps/x86/cpu-features.h         |   1 +
 sysdeps/x86/tst-get-cpu-features.c |   2 +
 4 files changed, 246 insertions(+)

--
2.7.4



保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

[PATCH v2 1/3] x86: Add CPU Vendor ID detection support for Zhaoxin processors

MayShao
To recognize Zhaoxin CPU Vendor ID, add a new architecture type
arch_kind_zhaoxin for Vendor Zhaoxin detection.

---
 sysdeps/x86/cpu-features.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 sysdeps/x86/cpu-features.h |  1 +
 2 files changed, 59 insertions(+)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 81a170a..4d60553 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -466,6 +466,64 @@ init_cpu_features (struct cpu_features *cpu_features)
          }
        }
     }
+  /* This spells out "CentaurHauls" or "  Shanghai  ".  */
+  else if ((ebx == 0x746e6543 && ecx == 0x736c7561 && edx == 0x48727561)
+          || (ebx == 0x68532020 && ecx == 0x20206961 && edx == 0x68676e61))
+    {
+      unsigned int extended_model, stepping;
+
+      kind = arch_kind_zhaoxin;
+
+      get_common_indices (cpu_features, &family, &model, &extended_model,
+                         &stepping);
+
+      get_extended_indices (cpu_features);
+
+
+      if (family == 0x6)
+        {
+          model += extended_model;
+          if (model == 0xf || model == 0x19)
+            {
+              cpu_features->feature[index_arch_AVX_Usable]
+                &= (~bit_arch_AVX_Usable
+                & ~bit_arch_AVX2_Usable);
+
+              cpu_features->feature[index_arch_Slow_SSE4_2]
+                |= (bit_arch_Slow_SSE4_2);
+
+              cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
+                &= ~bit_arch_AVX_Fast_Unaligned_Load;
+            }
+        }
+
+      if (family == 0x7)
+        {
+          model += extended_model;
+          if (model == 0x1b)
+            {
+              cpu_features->feature[index_arch_AVX_Usable]
+                &= (~bit_arch_AVX_Usable
+                & ~bit_arch_AVX2_Usable);
+
+              cpu_features->feature[index_arch_Slow_SSE4_2]
+                |= bit_arch_Slow_SSE4_2;
+
+              cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
+                &= ~bit_arch_AVX_Fast_Unaligned_Load;
+           }
+
+         if (model == 0x3b)
+           {
+             cpu_features->feature[index_arch_AVX_Usable]
+               &= (~bit_arch_AVX_Usable
+               & ~bit_arch_AVX2_Usable);
+
+               cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
+               &= ~bit_arch_AVX_Fast_Unaligned_Load;
+           }
+       }
+    }
   else
     {
       kind = arch_kind_other;
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index aea83e6..f05d5ce 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -53,6 +53,7 @@ enum cpu_features_kind
   arch_kind_unknown = 0,
   arch_kind_intel,
   arch_kind_amd,
+  arch_kind_zhaoxin,
   arch_kind_other
 };

--
2.7.4



保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

[PATCH v2 2/3] x86: Add cache information support for Zhaoxin processors

MayShao
In reply to this post by MayShao
To obtain Zhaoxin CPU cache information, add a new function
handle_zhaoxin().

Add Zhaoxin branch in init_cacheinfo() for initializing variables,
such as __x86_shared_cache_size.

---
 sysdeps/x86/cacheinfo.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)

diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index e3e8ef2..e5a3284 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -436,6 +436,57 @@ handle_amd (int name)
 }


+static long int __attribute__ ((noinline))
+handle_zhaoxin (int name)
+{
+  unsigned int eax;
+  unsigned int ebx;
+  unsigned int ecx;
+  unsigned int edx;
+
+  int folded_rel_name = (M(name) / 3) * 3;
+
+  unsigned int round = 0;
+  while (1)
+    {
+      __cpuid_count (4, round, eax, ebx, ecx, edx);
+
+      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
+      if (type == null)
+        break;
+
+      unsigned int level = (eax >> 5) & 0x7;
+
+      if ((level == 1 && type == data
+        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
+        || (level == 1 && type == inst
+            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
+        || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
+        || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
+        {
+          unsigned int offset = M(name) - folded_rel_name;
+
+          if (offset == 0)
+            /* Cache size.  */
+            return (((ebx >> 22) + 1)
+                * (((ebx >> 12) & 0x3ff) + 1)
+                * ((ebx & 0xfff) + 1)
+                * (ecx + 1));
+          if (offset == 1)
+            return (ebx >> 22) + 1;
+
+          assert (offset == 2);
+          return (ebx & 0xfff) + 1;
+        }
+
+      ++round;
+    }
+
+  /* Nothing found.  */
+  return 0;
+}
+
+
 /* Get the value of the system variable NAME.  */
 long int
 attribute_hidden
@@ -449,6 +500,9 @@ __cache_sysconf (int name)
   if (cpu_features->basic.kind == arch_kind_amd)
     return handle_amd (name);

+  if (cpu_features->basic.kind == arch_kind_zhaoxin)
+    return handle_zhaoxin (name);
+
   // XXX Fill in more vendors.

   /* CPU not known, we have no information.  */
@@ -751,6 +805,137 @@ intel_bug_no_cache_info:
        }
 #endif
     }
+  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
+    {
+      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
+      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+
+      /* Number of logical processors sharing L2 cache.  */
+      int threads_l2;
+
+      /* Number of logical processors sharing L3 cache.  */
+      int threads_l3;
+
+      if (shared <= 0)
+        {
+          /* No shared L3 cache.  All we have is the L2 cache.  */
+          level = 2;
+          shared = core;
+          threads_l2 = 0;
+          threads_l3 = -1;
+        }
+      else
+        {
+          level = 3;
+          threads_l2 = 0;
+          threads_l3 = 0;
+        }
+
+      int i = 0;
+
+      /* Query until cache level 2 and 3 are enumerated.  */
+      int check = 0x1 | (threads_l3 == 0) << 1;
+      do
+        {
+          __cpuid_count (4, i++, eax, ebx, ecx, edx);
+
+          switch ((eax >> 5) & 0x7)
+            {
+            default:
+              break;
+            case 2:
+              if ((check & 0x1))
+                {
+                  /* Get maximum number of logical processors
+                     sharing L2 cache.  */
+                  threads_l2 = (eax >> 14) & 0x3ff;
+                  check &= ~0x1;
+                }
+              break;
+            case 3:
+              if ((check & (0x1 << 1)))
+               {
+                  /* Get maximum number of logical processors
+                     sharing L3 cache.  */
+                  threads_l3 = (eax >> 14) & 0x3ff;
+                  check &= ~(0x1 << 1);
+                }
+              break;
+           }
+        }
+      while (check);
+
+      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
+         numbers of addressable IDs for logical processors sharing
+         the cache, instead of the maximum number of threads
+         sharing the cache.  */
+      if (max_cpuid >= 11)
+        {
+          /* Find the number of logical processors shipped in
+             one core and apply count mask.  */
+          i = 0;
+
+          /* Count SMT only if there is L3 cache.  Always count
+             core if there is no L3 cache.  */
+          int count = ((threads_l2 > 0 && level == 3)
+                       | ((threads_l3 > 0
+                           || (threads_l2 > 0 && level == 2)) << 1));
+
+          while (count)
+            {
+              __cpuid_count (11, i++, eax, ebx, ecx, edx);
+
+              int shipped = ebx & 0xff;
+              int type = ecx & 0xff00;
+              if (shipped == 0 || type == 0)
+                break;
+              else if (type == 0x100)
+                {
+                  /* Count SMT.  */
+                  if ((count & 0x1))
+                    {
+                      int count_mask;
+
+                      /* Compute count mask.  */
+                      asm ("bsr %1, %0"
+                           : "=r" (count_mask) : "g" (threads_l2));
+                      count_mask = ~(-1 << (count_mask + 1));
+                      threads_l2 = (shipped - 1) & count_mask;
+                      count &= ~0x1;
+                    }
+                }
+              else if (type == 0x200)
+                {
+                  /* Count core.  */
+                  if ((count & (0x1 << 1)))
+                    {
+                      int count_mask;
+                      int threads_core
+                        = (level == 2 ? threads_l2 : threads_l3);
+
+                      /* Compute count mask.  */
+                      asm ("bsr %1, %0"
+                           : "=r" (count_mask) : "g" (threads_core));
+                      count_mask = ~(-1 << (count_mask + 1));
+                      threads_core = (shipped - 1) & count_mask;
+                      if (level == 2)
+                        threads_l2 = threads_core;
+                      else
+                        threads_l3 = threads_core;
+                      count &= ~(0x1 << 1);
+                    }
+                }
+            }
+        }
+      if (level == 2 && threads_l2 > 0)
+        threads = threads_l2 + 1;
+      if (level == 3 && threads_l3 > 0)
+        threads = threads_l3 + 1;
+
+      if (shared > 0 && threads > 0)
+        shared /= threads;
+    }

   if (cpu_features->data_cache_size != 0)
     data = cpu_features->data_cache_size;
--
2.7.4



保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

[PATCH v2 3/3] x86: Add the test case of __get_cpu_features support for Zhaoxin processors

MayShao
In reply to this post by MayShao
For the test case of the __get_cpu_features interface, add an item in
cpu_kinds and a switch case for Zhaoxin support.

---
 sysdeps/x86/tst-get-cpu-features.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
index 0f55987..0dcb906 100644
--- a/sysdeps/x86/tst-get-cpu-features.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -38,6 +38,7 @@ static const char * const cpu_kinds[] =
   "Unknown",
   "Intel",
   "AMD",
+  "ZHAOXIN",
   "Other",
 };

@@ -50,6 +51,7 @@ do_test (void)
     {
     case arch_kind_intel:
     case arch_kind_amd:
+    case arch_kind_zhaoxin:
     case arch_kind_other:
       printf ("Vendor: %s\n", cpu_kinds[cpu_features->basic.kind]);
       printf ("Family: 0x%x\n", cpu_features->basic.family);
--
2.7.4



保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

RE: [PATCH v2 0/3] x86: Add support for Zhaoxin processors

MayShao
In reply to this post by MayShao
Ping.

>
> This patch series fixes the Shanghai Zhaoxin processor CPU Vendor ID detection
> problem in the glibc sysdep module. Current glibc doesn't recognize the Zhaoxin CPU
> Vendor IDs ("CentaurHauls" and "Shanghai") and sets kind to arch_kind_other.
> This leads to incorrect results from __cache_sysconf(), incorrect values for
> variables like __x86_shared_cache_size, and failure of the test case
> tst-get-cpu-features.
>
> Previous version:
> https://sourceware.org/pipermail/libc-alpha/2019-December/109170.html
> More discussion:
> https://sourceware.org/pipermail/libc-alpha/2019-December/109227.html
>
> Changes from previous version:
>     - Remove the bit_arch_Prefer_MAP_32BIT_EXEC flag on the Zhaoxin
> processor
>       with family==0x6.
>
> This series was tested on x86_64-linux-gnu.
>
> MayShao (3):
>   x86: Add CPU Vendor ID detection support for Zhaoxin processors
>   x86: Add cache information support for Zhaoxin processors
>   x86: Add the test case of __get_cpu_features support for Zhaoxin
>     processors
>
>  sysdeps/x86/cacheinfo.c            | 185
> +++++++++++++++++++++++++++++++++++++
>  sysdeps/x86/cpu-features.c         |  58 ++++++++++++
>  sysdeps/x86/cpu-features.h         |   1 +
>  sysdeps/x86/tst-get-cpu-features.c |   2 +
>  4 files changed, 246 insertions(+)
>
> --
> 2.7.4



保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 1/3] x86: Add CPU Vendor ID detection support for Zhaoxin processors

Sourceware - libc-alpha mailing list
In reply to this post by MayShao
On Sun, Mar 29, 2020 at 10:34 PM MayShao <[hidden email]> wrote:

>
> To recognize Zhaoxin CPU Vendor ID, add a new architecture type
> arch_kind_zhaoxin for Vendor Zhaoxin detection.
>
> ---
>  sysdeps/x86/cpu-features.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++
>  sysdeps/x86/cpu-features.h |  1 +
>  2 files changed, 59 insertions(+)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 81a170a..4d60553 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -466,6 +466,64 @@ init_cpu_features (struct cpu_features *cpu_features)
>           }
>         }
>      }
> +  /* This spells out "CentaurHauls" or "  Shanghai  ".  */
> +  else if ((ebx == 0x746e6543 && ecx == 0x736c7561 && edx == 0x48727561)
> +          || (ebx == 0x68532020 && ecx == 0x20206961 && edx == 0x68676e61))
> +    {
> +      unsigned int extended_model, stepping;
> +
> +      kind = arch_kind_zhaoxin;
> +
> +      get_common_indices (cpu_features, &family, &model, &extended_model,
> +                         &stepping);
> +
> +      get_extended_indices (cpu_features);
> +
> +

Single blank line.

> +      if (family == 0x6)
> +        {
> +          model += extended_model;

Move it out of if block.

> +          if (model == 0xf || model == 0x19)
> +            {
> +              cpu_features->feature[index_arch_AVX_Usable]
> +                &= (~bit_arch_AVX_Usable
> +                & ~bit_arch_AVX2_Usable);
> +
> +              cpu_features->feature[index_arch_Slow_SSE4_2]
> +                |= (bit_arch_Slow_SSE4_2);
> +
> +              cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
> +                &= ~bit_arch_AVX_Fast_Unaligned_Load;
> +            }
> +        }
> +
> +      if (family == 0x7)

else if

> +        {
> +          model += extended_model;

Remove it.

> +          if (model == 0x1b)
> +            {
> +              cpu_features->feature[index_arch_AVX_Usable]
> +                &= (~bit_arch_AVX_Usable
> +                & ~bit_arch_AVX2_Usable);
> +
> +              cpu_features->feature[index_arch_Slow_SSE4_2]
> +                |= bit_arch_Slow_SSE4_2;
> +
> +              cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
> +                &= ~bit_arch_AVX_Fast_Unaligned_Load;
> +           }
> +
> +         if (model == 0x3b)

else if

> +           {
> +             cpu_features->feature[index_arch_AVX_Usable]
> +               &= (~bit_arch_AVX_Usable
> +               & ~bit_arch_AVX2_Usable);
> +
> +               cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
> +               &= ~bit_arch_AVX_Fast_Unaligned_Load;
> +           }
> +       }
> +    }
>    else
>      {
>        kind = arch_kind_other;
> diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
> index aea83e6..f05d5ce 100644
> --- a/sysdeps/x86/cpu-features.h
> +++ b/sysdeps/x86/cpu-features.h
> @@ -53,6 +53,7 @@ enum cpu_features_kind
>    arch_kind_unknown = 0,
>    arch_kind_intel,
>    arch_kind_amd,
> +  arch_kind_zhaoxin,
>    arch_kind_other
>  };
>
> --
> 2.7.4
>
>


--
H.J.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 3/3] x86: Add the test case of __get_cpu_features support for Zhaoxin processors

Sourceware - libc-alpha mailing list
In reply to this post by MayShao
On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]> wrote:

>
> For the test case of the __get_cpu_features interface, add an item in
> cpu_kinds and a switch case for Zhaoxin support.
>
> ---
>  sysdeps/x86/tst-get-cpu-features.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
> index 0f55987..0dcb906 100644
> --- a/sysdeps/x86/tst-get-cpu-features.c
> +++ b/sysdeps/x86/tst-get-cpu-features.c
> @@ -38,6 +38,7 @@ static const char * const cpu_kinds[] =
>    "Unknown",
>    "Intel",
>    "AMD",
> +  "ZHAOXIN",
>    "Other",
>  };
>
> @@ -50,6 +51,7 @@ do_test (void)
>      {
>      case arch_kind_intel:
>      case arch_kind_amd:
> +    case arch_kind_zhaoxin:
>      case arch_kind_other:
>        printf ("Vendor: %s\n", cpu_kinds[cpu_features->basic.kind]);
>        printf ("Family: 0x%x\n", cpu_features->basic.family);
> --
> 2.7.4

LGTM.

BTW, have you finished your paperwork with FSF?

--
H.J.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 2/3] x86: Add cache information support for Zhaoxin processors

Sourceware - libc-alpha mailing list
In reply to this post by MayShao
On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]> wrote:

>
> To obtain Zhaoxin CPU cache information, add a new function
> handle_zhaoxin().
>
> Add Zhaoxin branch in init_cacheinfo() for initializing variables,
> such as __x86_shared_cache_size.
>
> ---
>  sysdeps/x86/cacheinfo.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 185 insertions(+)
>
> diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
> index e3e8ef2..e5a3284 100644
> --- a/sysdeps/x86/cacheinfo.c
> +++ b/sysdeps/x86/cacheinfo.c
> @@ -436,6 +436,57 @@ handle_amd (int name)
>  }
>
>
> +static long int __attribute__ ((noinline))
> +handle_zhaoxin (int name)
> +{
> +  unsigned int eax;
> +  unsigned int ebx;
> +  unsigned int ecx;
> +  unsigned int edx;
> +
> +  int folded_rel_name = (M(name) / 3) * 3;
> +
> +  unsigned int round = 0;
> +  while (1)
> +    {
> +      __cpuid_count (4, round, eax, ebx, ecx, edx);
> +
> +      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
> +      if (type == null)
> +        break;
> +
> +      unsigned int level = (eax >> 5) & 0x7;
> +
> +      if ((level == 1 && type == data
> +        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
> +        || (level == 1 && type == inst
> +            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
> +        || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
> +        || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
> +        {
> +          unsigned int offset = M(name) - folded_rel_name;
> +
> +          if (offset == 0)
> +            /* Cache size.  */
> +            return (((ebx >> 22) + 1)
> +                * (((ebx >> 12) & 0x3ff) + 1)
> +                * ((ebx & 0xfff) + 1)
> +                * (ecx + 1));
> +          if (offset == 1)
> +            return (ebx >> 22) + 1;
> +
> +          assert (offset == 2);
> +          return (ebx & 0xfff) + 1;
> +        }
> +
> +      ++round;
> +    }
> +
> +  /* Nothing found.  */
> +  return 0;
> +}
> +
> +
>  /* Get the value of the system variable NAME.  */
>  long int
>  attribute_hidden
> @@ -449,6 +500,9 @@ __cache_sysconf (int name)
>    if (cpu_features->basic.kind == arch_kind_amd)
>      return handle_amd (name);
>
> +  if (cpu_features->basic.kind == arch_kind_zhaoxin)
> +    return handle_zhaoxin (name);
> +
>    // XXX Fill in more vendors.
>
>    /* CPU not known, we have no information.  */
> @@ -751,6 +805,137 @@ intel_bug_no_cache_info:
>         }
>  #endif
>      }
> +  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> +    {
> +      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> +      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> +      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> +
> +      /* Number of logical processors sharing L2 cache.  */
> +      int threads_l2;
> +
> +      /* Number of logical processors sharing L3 cache.  */
> +      int threads_l3;
> +
> +      if (shared <= 0)
> +        {
> +          /* No shared L3 cache.  All we have is the L2 cache.  */
> +          level = 2;
> +          shared = core;
> +          threads_l2 = 0;
> +          threads_l3 = -1;
> +        }
> +      else
> +        {
> +          level = 3;
> +          threads_l2 = 0;
> +          threads_l3 = 0;
> +        }
> +
> +      int i = 0;
> +
> +      /* Query until cache level 2 and 3 are enumerated.  */
> +      int check = 0x1 | (threads_l3 == 0) << 1;
> +      do
> +        {
> +          __cpuid_count (4, i++, eax, ebx, ecx, edx);
> +
> +          switch ((eax >> 5) & 0x7)
> +            {
> +            default:
> +              break;
> +            case 2:
> +              if ((check & 0x1))
> +                {
> +                  /* Get maximum number of logical processors
> +                     sharing L2 cache.  */
> +                  threads_l2 = (eax >> 14) & 0x3ff;
> +                  check &= ~0x1;
> +                }
> +              break;
> +            case 3:
> +              if ((check & (0x1 << 1)))
> +               {
> +                  /* Get maximum number of logical processors
> +                     sharing L3 cache.  */
> +                  threads_l3 = (eax >> 14) & 0x3ff;
> +                  check &= ~(0x1 << 1);
> +                }
> +              break;
> +           }
> +        }
> +      while (check);
> +
> +      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
> +         numbers of addressable IDs for logical processors sharing
> +         the cache, instead of the maximum number of threads
> +         sharing the cache.  */
> +      if (max_cpuid >= 11)
> +        {
> +          /* Find the number of logical processors shipped in
> +             one core and apply count mask.  */
> +          i = 0;
> +
> +          /* Count SMT only if there is L3 cache.  Always count
> +             core if there is no L3 cache.  */
> +          int count = ((threads_l2 > 0 && level == 3)
> +                       | ((threads_l3 > 0
> +                           || (threads_l2 > 0 && level == 2)) << 1));
> +
> +          while (count)
> +            {
> +              __cpuid_count (11, i++, eax, ebx, ecx, edx);
> +
> +              int shipped = ebx & 0xff;
> +              int type = ecx & 0xff00;
> +              if (shipped == 0 || type == 0)
> +                break;
> +              else if (type == 0x100)
> +                {
> +                  /* Count SMT.  */
> +                  if ((count & 0x1))
> +                    {
> +                      int count_mask;
> +
> +                      /* Compute count mask.  */
> +                      asm ("bsr %1, %0"
> +                           : "=r" (count_mask) : "g" (threads_l2));
> +                      count_mask = ~(-1 << (count_mask + 1));
> +                      threads_l2 = (shipped - 1) & count_mask;
> +                      count &= ~0x1;
> +                    }
> +                }
> +              else if (type == 0x200)
> +                {
> +                  /* Count core.  */
> +                  if ((count & (0x1 << 1)))
> +                    {
> +                      int count_mask;
> +                      int threads_core
> +                        = (level == 2 ? threads_l2 : threads_l3);
> +
> +                      /* Compute count mask.  */
> +                      asm ("bsr %1, %0"
> +                           : "=r" (count_mask) : "g" (threads_core));
> +                      count_mask = ~(-1 << (count_mask + 1));
> +                      threads_core = (shipped - 1) & count_mask;
> +                      if (level == 2)
> +                        threads_l2 = threads_core;
> +                      else
> +                        threads_l3 = threads_core;
> +                      count &= ~(0x1 << 1);
> +                    }
> +                }
> +            }
> +        }
> +      if (level == 2 && threads_l2 > 0)
> +        threads = threads_l2 + 1;
> +      if (level == 3 && threads_l3 > 0)
> +        threads = threads_l3 + 1;
> +
> +      if (shared > 0 && threads > 0)
> +        shared /= threads;
> +    }

This code looks very similar to the Intel code.  Can you factor it out and
reuse it for Zhaoxin?

>    if (cpu_features->data_cache_size != 0)
>      data = cpu_features->data_cache_size;

--
H.J.
Reply | Threaded
Open this post in threaded view
|

RE: [PATCH v2 3/3] x86: Add the test case of __get_cpu_features support for Zhaoxin processors

MayShao
In reply to this post by Sourceware - libc-alpha mailing list


On Tue, April 7, 2020 at 8:38 PM H.J. Lu <[hidden email]> wrote:

>
> On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]> wrote:
> >
> > For the test case of the __get_cpu_features interface, add an item in
> > cpu_kinds and a switch case for Zhaoxin support.
> >
> > ---
> >  sysdeps/x86/tst-get-cpu-features.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/sysdeps/x86/tst-get-cpu-features.c
> b/sysdeps/x86/tst-get-cpu-features.c
> > index 0f55987..0dcb906 100644
> > --- a/sysdeps/x86/tst-get-cpu-features.c
> > +++ b/sysdeps/x86/tst-get-cpu-features.c
> > @@ -38,6 +38,7 @@ static const char * const cpu_kinds[] =
> >    "Unknown",
> >    "Intel",
> >    "AMD",
> > +  "ZHAOXIN",
> >    "Other",
> >  };
> >
> > @@ -50,6 +51,7 @@ do_test (void)
> >      {
> >      case arch_kind_intel:
> >      case arch_kind_amd:
> > +    case arch_kind_zhaoxin:
> >      case arch_kind_other:
> >        printf ("Vendor: %s\n", cpu_kinds[cpu_features->basic.kind]);
> >        printf ("Family: 0x%x\n", cpu_features->basic.family);
> > --
> > 2.7.4
>
> LGTM.
>
> BTW, have you finished your paperwork with FSF?

Yes, I have finished the paperwork with FSF.  If there is any problem,
please let me know.

Thanks for the review.

Best Regards,
May Shao

> --
> H.J.


保密声明:
本邮件含有保密或专有信息,仅供指定收件人使用。严禁对本邮件或其内容做任何未经授权的查阅、使用、复制或转发。
CONFIDENTIAL NOTE:
This email contains confidential or legally privileged information and is for the sole use of its intended recipient. Any unauthorized review, use, copying or forwarding of this email or the content of this email is strictly prohibited.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 3/3] x86: Add the test case of __get_cpu_features support for Zhaoxin processors

Sourceware - libc-alpha mailing list
On 4/7/20 10:36 PM, May Shao(BJ-RD) wrote:

>
>
> On Tue, April 7, 2020 at 8:38 PM H.J. Lu <[hidden email]> wrote:
>>
>> On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]> wrote:
>>>
>>> For the test case of the __get_cpu_features interface, add an item in
>>> cpu_kinds and a switch case for Zhaoxin support.
>>>
>>> ---
>>>  sysdeps/x86/tst-get-cpu-features.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/sysdeps/x86/tst-get-cpu-features.c
>> b/sysdeps/x86/tst-get-cpu-features.c
>>> index 0f55987..0dcb906 100644
>>> --- a/sysdeps/x86/tst-get-cpu-features.c
>>> +++ b/sysdeps/x86/tst-get-cpu-features.c
>>> @@ -38,6 +38,7 @@ static const char * const cpu_kinds[] =
>>>    "Unknown",
>>>    "Intel",
>>>    "AMD",
>>> +  "ZHAOXIN",
>>>    "Other",
>>>  };
>>>
>>> @@ -50,6 +51,7 @@ do_test (void)
>>>      {
>>>      case arch_kind_intel:
>>>      case arch_kind_amd:
>>> +    case arch_kind_zhaoxin:
>>>      case arch_kind_other:
>>>        printf ("Vendor: %s\n", cpu_kinds[cpu_features->basic.kind]);
>>>        printf ("Family: 0x%x\n", cpu_features->basic.family);
>>> --
>>> 2.7.4
>>
>> LGTM.
>>
>> BTW, have you finished your paperwork with FSF?
>
> Yes, I have finished the paperwork with FSF.  If there is any problem,
> please let me know.

I confirm that we can accept patches only from May Shao from Zhaoxin.

--
Cheers,
Carlos.

Reply | Threaded
Open this post in threaded view
|

RE: [PATCH v2 2/3] x86: Add cache information support for Zhaoxin processors

Mayshao-oc
In reply to this post by Sourceware - libc-alpha mailing list

On Tue, April 7, 2020 at 8:44 PM H.J. Lu <[hidden email]> wrote:

>
> On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]> wrote:
> >
> > To obtain Zhaoxin CPU cache information, add a new function
> > handle_zhaoxin().
> >
> > Add Zhaoxin branch in init_cacheinfo() for initializing variables,
> > such as __x86_shared_cache_size.
> >
> > ---
> >  sysdeps/x86/cacheinfo.c | 185
> ++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 185 insertions(+)
> >
> > diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
> > index e3e8ef2..e5a3284 100644
> > --- a/sysdeps/x86/cacheinfo.c
> > +++ b/sysdeps/x86/cacheinfo.c
> > @@ -436,6 +436,57 @@ handle_amd (int name)
> >  }
> >
> >
> > +static long int __attribute__ ((noinline))
> > +handle_zhaoxin (int name)
> > +{
> > +  unsigned int eax;
> > +  unsigned int ebx;
> > +  unsigned int ecx;
> > +  unsigned int edx;
> > +
> > +  int folded_rel_name = (M(name) / 3) * 3;
> > +
> > +  unsigned int round = 0;
> > +  while (1)
> > +    {
> > +      __cpuid_count (4, round, eax, ebx, ecx, edx);
> > +
> > +      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
> > +      if (type == null)
> > +        break;
> > +
> > +      unsigned int level = (eax >> 5) & 0x7;
> > +
> > +      if ((level == 1 && type == data
> > +        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
> > +        || (level == 1 && type == inst
> > +            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
> > +        || (level == 2 && folded_rel_name ==
> M(_SC_LEVEL2_CACHE_SIZE))
> > +        || (level == 3 && folded_rel_name ==
> M(_SC_LEVEL3_CACHE_SIZE)))
> > +        {
> > +          unsigned int offset = M(name) - folded_rel_name;
> > +
> > +          if (offset == 0)
> > +            /* Cache size.  */
> > +            return (((ebx >> 22) + 1)
> > +                * (((ebx >> 12) & 0x3ff) + 1)
> > +                * ((ebx & 0xfff) + 1)
> > +                * (ecx + 1));
> > +          if (offset == 1)
> > +            return (ebx >> 22) + 1;
> > +
> > +          assert (offset == 2);
> > +          return (ebx & 0xfff) + 1;
> > +        }
> > +
> > +      ++round;
> > +    }
> > +
> > +  /* Nothing found.  */
> > +  return 0;
> > +}
> > +
> > +
> >  /* Get the value of the system variable NAME.  */
> >  long int
> >  attribute_hidden
> > @@ -449,6 +500,9 @@ __cache_sysconf (int name)
> >    if (cpu_features->basic.kind == arch_kind_amd)
> >      return handle_amd (name);
> >
> > +  if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > +    return handle_zhaoxin (name);
> > +
> >    // XXX Fill in more vendors.
> >
> >    /* CPU not known, we have no information.  */
> > @@ -751,6 +805,137 @@ intel_bug_no_cache_info:
> >         }
> >  #endif
> >      }
> > +  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > +    {
> > +      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> > +      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> > +      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> > +
> > +      /* Number of logical processors sharing L2 cache.  */
> > +      int threads_l2;
> > +
> > +      /* Number of logical processors sharing L3 cache.  */
> > +      int threads_l3;
> > +
> > +      if (shared <= 0)
> > +        {
> > +          /* No shared L3 cache.  All we have is the L2 cache.  */
> > +          level = 2;
> > +          shared = core;
> > +          threads_l2 = 0;
> > +          threads_l3 = -1;
> > +        }
> > +      else
> > +        {
> > +          level = 3;
> > +          threads_l2 = 0;
> > +          threads_l3 = 0;
> > +        }
> > +
> > +      int i = 0;
> > +
> > +      /* Query until cache level 2 and 3 are enumerated.  */
> > +      int check = 0x1 | (threads_l3 == 0) << 1;
> > +      do
> > +        {
> > +          __cpuid_count (4, i++, eax, ebx, ecx, edx);
> > +
> > +          switch ((eax >> 5) & 0x7)
> > +            {
> > +            default:
> > +              break;
> > +            case 2:
> > +              if ((check & 0x1))
> > +                {
> > +                  /* Get maximum number of logical processors
> > +                     sharing L2 cache.  */
> > +                  threads_l2 = (eax >> 14) & 0x3ff;
> > +                  check &= ~0x1;
> > +                }
> > +              break;
> > +            case 3:
> > +              if ((check & (0x1 << 1)))
> > +               {
> > +                  /* Get maximum number of logical processors
> > +                     sharing L3 cache.  */
> > +                  threads_l3 = (eax >> 14) & 0x3ff;
> > +                  check &= ~(0x1 << 1);
> > +                }
> > +              break;
> > +           }
> > +        }
> > +      while (check);
> > +
> > +      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
> > +         numbers of addressable IDs for logical processors sharing
> > +         the cache, instead of the maximum number of threads
> > +         sharing the cache.  */
> > +      if (max_cpuid >= 11)
> > +        {
> > +          /* Find the number of logical processors shipped in
> > +             one core and apply count mask.  */
> > +          i = 0;
> > +
> > +          /* Count SMT only if there is L3 cache.  Always count
> > +             core if there is no L3 cache.  */
> > +          int count = ((threads_l2 > 0 && level == 3)
> > +                       | ((threads_l3 > 0
> > +                           || (threads_l2 > 0 && level == 2)) << 1));
> > +
> > +          while (count)
> > +            {
> > +              __cpuid_count (11, i++, eax, ebx, ecx, edx);
> > +
> > +              int shipped = ebx & 0xff;
> > +              int type = ecx & 0xff00;
> > +              if (shipped == 0 || type == 0)
> > +                break;
> > +              else if (type == 0x100)
> > +                {
> > +                  /* Count SMT.  */
> > +                  if ((count & 0x1))
> > +                    {
> > +                      int count_mask;
> > +
> > +                      /* Compute count mask.  */
> > +                      asm ("bsr %1, %0"
> > +                           : "=r" (count_mask) : "g" (threads_l2));
> > +                      count_mask = ~(-1 << (count_mask + 1));
> > +                      threads_l2 = (shipped - 1) & count_mask;
> > +                      count &= ~0x1;
> > +                    }
> > +                }
> > +              else if (type == 0x200)
> > +                {
> > +                  /* Count core.  */
> > +                  if ((count & (0x1 << 1)))
> > +                    {
> > +                      int count_mask;
> > +                      int threads_core
> > +                        = (level == 2 ? threads_l2 : threads_l3);
> > +
> > +                      /* Compute count mask.  */
> > +                      asm ("bsr %1, %0"
> > +                           : "=r" (count_mask) : "g" (threads_core));
> > +                      count_mask = ~(-1 << (count_mask + 1));
> > +                      threads_core = (shipped - 1) & count_mask;
> > +                      if (level == 2)
> > +                        threads_l2 = threads_core;
> > +                      else
> > +                        threads_l3 = threads_core;
> > +                      count &= ~(0x1 << 1);
> > +                    }
> > +                }
> > +            }
> > +        }
> > +      if (level == 2 && threads_l2 > 0)
> > +        threads = threads_l2 + 1;
> > +      if (level == 3 && threads_l3 > 0)
> > +        threads = threads_l3 + 1;
> > +
> > +      if (shared > 0 && threads > 0)
> > +        shared /= threads;
> > +    }
>
> This code looks very similar to Intel code.   Can you factor it out and reuse
> it for you?

I tried to extract this part of the code, but the result didn’t look very clean.
For example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.
Zhaoxin processors currently have inclusive caches, and the number of threads
sharing the L2 cache is not affected by the family or model.  Considering possible
changes of CPU design in the future, it may be more convenient to keep separate branches.

I was wondering if you had any concerns.  If you could give some suggestions,
that would be great.


Best Regards,
May Shao

Reply | Threaded
Open this post in threaded view
|

RE: [PATCH v2 1/3] x86: Add CPU Vendor ID detection support for Zhaoxin processors

Mayshao-oc
In reply to this post by Sourceware - libc-alpha mailing list

On Tue, Apr 7, 2020 at 8:38 PM H.J. Lu <[hidden email]> wrote:

>
> On Sun, Mar 29, 2020 at 10:34 PM MayShao <[hidden email]> wrote:
> >
> > To recognize Zhaoxin CPU Vendor ID, add a new architecture type
> > arch_kind_zhaoxin for Vendor Zhaoxin detection.
> >
> > ---
> >  sysdeps/x86/cpu-features.c | 58
> ++++++++++++++++++++++++++++++++++++++++++++++
> >  sysdeps/x86/cpu-features.h |  1 +
> >  2 files changed, 59 insertions(+)
> >
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index 81a170a..4d60553 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -466,6 +466,64 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> >           }
> >         }
> >      }
> > +  /* This spells out "CentaurHauls" or " Shanghai ".  */
> > +  else if ((ebx == 0x746e6543 && ecx == 0x736c7561 && edx ==
> 0x48727561)
> > +          || (ebx == 0x68532020 && ecx == 0x20206961 && edx ==
> 0x68676e61))
> > +    {
> > +      unsigned int extended_model, stepping;
> > +
> > +      kind = arch_kind_zhaoxin;
> > +
> > +      get_common_indices (cpu_features, &family, &model,
> &extended_model,
> > +                         &stepping);
> > +
> > +      get_extended_indices (cpu_features);
> > +
> > +
>
> Single blank line.
>
> > +      if (family == 0x6)
> > +        {
> > +          model += extended_model;
>
> Move it out of if block.
>
> > +          if (model == 0xf || model == 0x19)
> > +            {
> > +              cpu_features->feature[index_arch_AVX_Usable]
> > +                &= (~bit_arch_AVX_Usable
> > +                & ~bit_arch_AVX2_Usable);
> > +
> > +              cpu_features->feature[index_arch_Slow_SSE4_2]
> > +                |= (bit_arch_Slow_SSE4_2);
> > +
> > +
> cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
> > +                &= ~bit_arch_AVX_Fast_Unaligned_Load;
> > +            }
> > +        }
> > +
> > +      if (family == 0x7)
>
> else if
>
> > +        {
> > +          model += extended_model;
>
> Remove it.
>
> > +          if (model == 0x1b)
> > +            {
> > +              cpu_features->feature[index_arch_AVX_Usable]
> > +                &= (~bit_arch_AVX_Usable
> > +                & ~bit_arch_AVX2_Usable);
> > +
> > +              cpu_features->feature[index_arch_Slow_SSE4_2]
> > +                |= bit_arch_Slow_SSE4_2;
> > +
> > +
> cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
> > +                &= ~bit_arch_AVX_Fast_Unaligned_Load;
> > +           }
> > +
> > +         if (model == 0x3b)
>
> else if
>
> > +           {
> > +             cpu_features->feature[index_arch_AVX_Usable]
> > +               &= (~bit_arch_AVX_Usable
> > +               & ~bit_arch_AVX2_Usable);
> > +
> > +
> cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
> > +               &= ~bit_arch_AVX_Fast_Unaligned_Load;
> > +           }
> > +       }
> > +    }
> >    else
> >      {
> >        kind = arch_kind_other;
> > diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
> > index aea83e6..f05d5ce 100644
> > --- a/sysdeps/x86/cpu-features.h
> > +++ b/sysdeps/x86/cpu-features.h
> > @@ -53,6 +53,7 @@ enum cpu_features_kind
> >    arch_kind_unknown = 0,
> >    arch_kind_intel,
> >    arch_kind_amd,
> > +  arch_kind_zhaoxin,
> >    arch_kind_other
> >  };
> >
> > --
> > 2.7.4
> >
> >
Thank you for your comments.  I have fixed the places you mentioned
and attached the updated patch to this email.


Best Regards,
May Shao


0001-x86-Add-CPU-Vendor-ID-detection-support-for-Zhaoxin-.patch (3K) Download Attachment
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 2/3] x86: Add cache information support for Zhaoxin processors

Sourceware - libc-alpha mailing list
In reply to this post by Mayshao-oc
On Thu, Apr 9, 2020 at 7:34 PM Mayshao-oc <[hidden email]> wrote:

>
>
> On Tue, April 7, 2020 at 8:44 PM H.J. Lu <[hidden email]> wrote:
> >
> > On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]> wrote:
> > >
> > > To obtain Zhaoxin CPU cache information, add a new function
> > > handle_zhaoxin().
> > >
> > > Add Zhaoxin branch in init_cacheinfo() for initializing variables,
> > > such as __x86_shared_cache_size.
> > >
> > > ---
> > >  sysdeps/x86/cacheinfo.c | 185
> > ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 185 insertions(+)
> > >
> > > diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
> > > index e3e8ef2..e5a3284 100644
> > > --- a/sysdeps/x86/cacheinfo.c
> > > +++ b/sysdeps/x86/cacheinfo.c
> > > @@ -436,6 +436,57 @@ handle_amd (int name)
> > >  }
> > >
> > >
> > > +static long int __attribute__ ((noinline))
> > > +handle_zhaoxin (int name)
> > > +{
> > > +  unsigned int eax;
> > > +  unsigned int ebx;
> > > +  unsigned int ecx;
> > > +  unsigned int edx;
> > > +
> > > +  int folded_rel_name = (M(name) / 3) * 3;
> > > +
> > > +  unsigned int round = 0;
> > > +  while (1)
> > > +    {
> > > +      __cpuid_count (4, round, eax, ebx, ecx, edx);
> > > +
> > > +      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
> > > +      if (type == null)
> > > +        break;
> > > +
> > > +      unsigned int level = (eax >> 5) & 0x7;
> > > +
> > > +      if ((level == 1 && type == data
> > > +        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
> > > +        || (level == 1 && type == inst
> > > +            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
> > > +        || (level == 2 && folded_rel_name ==
> > M(_SC_LEVEL2_CACHE_SIZE))
> > > +        || (level == 3 && folded_rel_name ==
> > M(_SC_LEVEL3_CACHE_SIZE)))
> > > +        {
> > > +          unsigned int offset = M(name) - folded_rel_name;
> > > +
> > > +          if (offset == 0)
> > > +            /* Cache size.  */
> > > +            return (((ebx >> 22) + 1)
> > > +                * (((ebx >> 12) & 0x3ff) + 1)
> > > +                * ((ebx & 0xfff) + 1)
> > > +                * (ecx + 1));
> > > +          if (offset == 1)
> > > +            return (ebx >> 22) + 1;
> > > +
> > > +          assert (offset == 2);
> > > +          return (ebx & 0xfff) + 1;
> > > +        }
> > > +
> > > +      ++round;
> > > +    }
> > > +
> > > +  /* Nothing found.  */
> > > +  return 0;
> > > +}
> > > +
> > > +
> > >  /* Get the value of the system variable NAME.  */
> > >  long int
> > >  attribute_hidden
> > > @@ -449,6 +500,9 @@ __cache_sysconf (int name)
> > >    if (cpu_features->basic.kind == arch_kind_amd)
> > >      return handle_amd (name);
> > >
> > > +  if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > > +    return handle_zhaoxin (name);
> > > +
> > >    // XXX Fill in more vendors.
> > >
> > >    /* CPU not known, we have no information.  */
> > > @@ -751,6 +805,137 @@ intel_bug_no_cache_info:
> > >         }
> > >  #endif
> > >      }
> > > +  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > > +    {
> > > +      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> > > +      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> > > +      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> > > +
> > > +      /* Number of logical processors sharing L2 cache.  */
> > > +      int threads_l2;
> > > +
> > > +      /* Number of logical processors sharing L3 cache.  */
> > > +      int threads_l3;
> > > +
> > > +      if (shared <= 0)
> > > +        {
> > > +          /* No shared L3 cache.  All we have is the L2 cache.  */
> > > +          level = 2;
> > > +          shared = core;
> > > +          threads_l2 = 0;
> > > +          threads_l3 = -1;
> > > +        }
> > > +      else
> > > +        {
> > > +          level = 3;
> > > +          threads_l2 = 0;
> > > +          threads_l3 = 0;
> > > +        }
> > > +
> > > +      int i = 0;
> > > +
> > > +      /* Query until cache level 2 and 3 are enumerated.  */
> > > +      int check = 0x1 | (threads_l3 == 0) << 1;
> > > +      do
> > > +        {
> > > +          __cpuid_count (4, i++, eax, ebx, ecx, edx);
> > > +
> > > +          switch ((eax >> 5) & 0x7)
> > > +            {
> > > +            default:
> > > +              break;
> > > +            case 2:
> > > +              if ((check & 0x1))
> > > +                {
> > > +                  /* Get maximum number of logical processors
> > > +                     sharing L2 cache.  */
> > > +                  threads_l2 = (eax >> 14) & 0x3ff;
> > > +                  check &= ~0x1;
> > > +                }
> > > +              break;
> > > +            case 3:
> > > +              if ((check & (0x1 << 1)))
> > > +               {
> > > +                  /* Get maximum number of logical processors
> > > +                     sharing L3 cache.  */
> > > +                  threads_l3 = (eax >> 14) & 0x3ff;
> > > +                  check &= ~(0x1 << 1);
> > > +                }
> > > +              break;
> > > +           }
> > > +        }
> > > +      while (check);
> > > +
> > > +      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
> > > +         numbers of addressable IDs for logical processors sharing
> > > +         the cache, instead of the maximum number of threads
> > > +         sharing the cache.  */
> > > +      if (max_cpuid >= 11)
> > > +        {
> > > +          /* Find the number of logical processors shipped in
> > > +             one core and apply count mask.  */
> > > +          i = 0;
> > > +
> > > +          /* Count SMT only if there is L3 cache.  Always count
> > > +             core if there is no L3 cache.  */
> > > +          int count = ((threads_l2 > 0 && level == 3)
> > > +                       | ((threads_l3 > 0
> > > +                           || (threads_l2 > 0 && level == 2)) << 1));
> > > +
> > > +          while (count)
> > > +            {
> > > +              __cpuid_count (11, i++, eax, ebx, ecx, edx);
> > > +
> > > +              int shipped = ebx & 0xff;
> > > +              int type = ecx & 0xff00;
> > > +              if (shipped == 0 || type == 0)
> > > +                break;
> > > +              else if (type == 0x100)
> > > +                {
> > > +                  /* Count SMT.  */
> > > +                  if ((count & 0x1))
> > > +                    {
> > > +                      int count_mask;
> > > +
> > > +                      /* Compute count mask.  */
> > > +                      asm ("bsr %1, %0"
> > > +                           : "=r" (count_mask) : "g" (threads_l2));
> > > +                      count_mask = ~(-1 << (count_mask + 1));
> > > +                      threads_l2 = (shipped - 1) & count_mask;
> > > +                      count &= ~0x1;
> > > +                    }
> > > +                }
> > > +              else if (type == 0x200)
> > > +                {
> > > +                  /* Count core.  */
> > > +                  if ((count & (0x1 << 1)))
> > > +                    {
> > > +                      int count_mask;
> > > +                      int threads_core
> > > +                        = (level == 2 ? threads_l2 : threads_l3);
> > > +
> > > +                      /* Compute count mask.  */
> > > +                      asm ("bsr %1, %0"
> > > +                           : "=r" (count_mask) : "g" (threads_core));
> > > +                      count_mask = ~(-1 << (count_mask + 1));
> > > +                      threads_core = (shipped - 1) & count_mask;
> > > +                      if (level == 2)
> > > +                        threads_l2 = threads_core;
> > > +                      else
> > > +                        threads_l3 = threads_core;
> > > +                      count &= ~(0x1 << 1);
> > > +                    }
> > > +                }
> > > +            }
> > > +        }
> > > +      if (level == 2 && threads_l2 > 0)
> > > +        threads = threads_l2 + 1;
> > > +      if (level == 3 && threads_l3 > 0)
> > > +        threads = threads_l3 + 1;
> > > +
> > > +      if (shared > 0 && threads > 0)
> > > +        shared /= threads;
> > > +    }
> >
> > This code looks very similar to Intel code.   Can you factor it out and reuse
> > it for you?
>
> I tried to extract this part of the code, but it didn’t look very clean.
> For Example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.

The same as new Intel processors.

> Zhaoxin processors are currently inclusive caches, and the number of threads

Do Zhaoxin processors use CPUID bit to indicate inclusive caches?

> sharing L2 cache is not affected by the family or model.  Considering the possible

You can check cpu_features->basic.kind for that.

> changes of CPU design in future, it may be more convenient to keep separate branches.
>
> I was wandering if you had any concerns, or if you could give some suggestions,
> that would be great.

Let's avoid code duplication for now and revisit it in the future.

--
H.J.
Reply | Threaded
Open this post in threaded view
|

RE: [PATCH v2 2/3] x86: Add cache information support for Zhaoxin processors

Mayshao-oc

On Fri, Apr 10, 2020 at 7:53 PM H.J. Lu <[hidden email]> wrote:

>
> On Thu, Apr 9, 2020 at 7:34 PM Mayshao-oc <[hidden email]>
> wrote:
> >
> >
> > On Tue, April 7, 2020 at 8:44 PM H.J. Lu <[hidden email]> wrote:
> > >
> > > On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]>
> wrote:
> > > >
> > > > To obtain Zhaoxin CPU cache information, add a new function
> > > > handle_zhaoxin().
> > > >
> > > > Add Zhaoxin branch in init_cacheinfo() for initializing variables,
> > > > such as __x86_shared_cache_size.
> > > >
> > > > ---
> > > >  sysdeps/x86/cacheinfo.c | 185
> > > ++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 185 insertions(+)
> > > >
> > > > diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
> > > > index e3e8ef2..e5a3284 100644
> > > > --- a/sysdeps/x86/cacheinfo.c
> > > > +++ b/sysdeps/x86/cacheinfo.c
> > > > @@ -436,6 +436,57 @@ handle_amd (int name)
> > > >  }
> > > >
> > > >
> > > > +static long int __attribute__ ((noinline))
> > > > +handle_zhaoxin (int name)
> > > > +{
> > > > +  unsigned int eax;
> > > > +  unsigned int ebx;
> > > > +  unsigned int ecx;
> > > > +  unsigned int edx;
> > > > +
> > > > +  int folded_rel_name = (M(name) / 3) * 3;
> > > > +
> > > > +  unsigned int round = 0;
> > > > +  while (1)
> > > > +    {
> > > > +      __cpuid_count (4, round, eax, ebx, ecx, edx);
> > > > +
> > > > +      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
> > > > +      if (type == null)
> > > > +        break;
> > > > +
> > > > +      unsigned int level = (eax >> 5) & 0x7;
> > > > +
> > > > +      if ((level == 1 && type == data
> > > > +        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
> > > > +        || (level == 1 && type == inst
> > > > +            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
> > > > +        || (level == 2 && folded_rel_name ==
> > > M(_SC_LEVEL2_CACHE_SIZE))
> > > > +        || (level == 3 && folded_rel_name ==
> > > M(_SC_LEVEL3_CACHE_SIZE)))
> > > > +        {
> > > > +          unsigned int offset = M(name) - folded_rel_name;
> > > > +
> > > > +          if (offset == 0)
> > > > +            /* Cache size.  */
> > > > +            return (((ebx >> 22) + 1)
> > > > +                * (((ebx >> 12) & 0x3ff) + 1)
> > > > +                * ((ebx & 0xfff) + 1)
> > > > +                * (ecx + 1));
> > > > +          if (offset == 1)
> > > > +            return (ebx >> 22) + 1;
> > > > +
> > > > +          assert (offset == 2);
> > > > +          return (ebx & 0xfff) + 1;
> > > > +        }
> > > > +
> > > > +      ++round;
> > > > +    }
> > > > +
> > > > +  /* Nothing found.  */
> > > > +  return 0;
> > > > +}
> > > > +
> > > > +
> > > >  /* Get the value of the system variable NAME.  */
> > > >  long int
> > > >  attribute_hidden
> > > > @@ -449,6 +500,9 @@ __cache_sysconf (int name)
> > > >    if (cpu_features->basic.kind == arch_kind_amd)
> > > >      return handle_amd (name);
> > > >
> > > > +  if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > > > +    return handle_zhaoxin (name);
> > > > +
> > > >    // XXX Fill in more vendors.
> > > >
> > > >    /* CPU not known, we have no information.  */
> > > > @@ -751,6 +805,137 @@ intel_bug_no_cache_info:
> > > >         }
> > > >  #endif
> > > >      }
> > > > +  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > > > +    {
> > > > +      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> > > > +      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> > > > +      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> > > > +
> > > > +      /* Number of logical processors sharing L2 cache.  */
> > > > +      int threads_l2;
> > > > +
> > > > +      /* Number of logical processors sharing L3 cache.  */
> > > > +      int threads_l3;
> > > > +
> > > > +      if (shared <= 0)
> > > > +        {
> > > > +          /* No shared L3 cache.  All we have is the L2 cache.  */
> > > > +          level = 2;
> > > > +          shared = core;
> > > > +          threads_l2 = 0;
> > > > +          threads_l3 = -1;
> > > > +        }
> > > > +      else
> > > > +        {
> > > > +          level = 3;
> > > > +          threads_l2 = 0;
> > > > +          threads_l3 = 0;
> > > > +        }
> > > > +
> > > > +      int i = 0;
> > > > +
> > > > +      /* Query until cache level 2 and 3 are enumerated.  */
> > > > +      int check = 0x1 | (threads_l3 == 0) << 1;
> > > > +      do
> > > > +        {
> > > > +          __cpuid_count (4, i++, eax, ebx, ecx, edx);
> > > > +
> > > > +          switch ((eax >> 5) & 0x7)
> > > > +            {
> > > > +            default:
> > > > +              break;
> > > > +            case 2:
> > > > +              if ((check & 0x1))
> > > > +                {
> > > > +                  /* Get maximum number of logical processors
> > > > +                     sharing L2 cache.  */
> > > > +                  threads_l2 = (eax >> 14) & 0x3ff;
> > > > +                  check &= ~0x1;
> > > > +                }
> > > > +              break;
> > > > +            case 3:
> > > > +              if ((check & (0x1 << 1)))
> > > > +               {
> > > > +                  /* Get maximum number of logical processors
> > > > +                     sharing L3 cache.  */
> > > > +                  threads_l3 = (eax >> 14) & 0x3ff;
> > > > +                  check &= ~(0x1 << 1);
> > > > +                }
> > > > +              break;
> > > > +           }
> > > > +        }
> > > > +      while (check);
> > > > +
> > > > +      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the
> maximum
> > > > +         numbers of addressable IDs for logical processors sharing
> > > > +         the cache, instead of the maximum number of threads
> > > > +         sharing the cache.  */
> > > > +      if (max_cpuid >= 11)
> > > > +        {
> > > > +          /* Find the number of logical processors shipped in
> > > > +             one core and apply count mask.  */
> > > > +          i = 0;
> > > > +
> > > > +          /* Count SMT only if there is L3 cache.  Always count
> > > > +             core if there is no L3 cache.  */
> > > > +          int count = ((threads_l2 > 0 && level == 3)
> > > > +                       | ((threads_l3 > 0
> > > > +                           || (threads_l2 > 0 && level == 2)) <<
> 1));
> > > > +
> > > > +          while (count)
> > > > +            {
> > > > +              __cpuid_count (11, i++, eax, ebx, ecx, edx);
> > > > +
> > > > +              int shipped = ebx & 0xff;
> > > > +              int type = ecx & 0xff00;
> > > > +              if (shipped == 0 || type == 0)
> > > > +                break;
> > > > +              else if (type == 0x100)
> > > > +                {
> > > > +                  /* Count SMT.  */
> > > > +                  if ((count & 0x1))
> > > > +                    {
> > > > +                      int count_mask;
> > > > +
> > > > +                      /* Compute count mask.  */
> > > > +                      asm ("bsr %1, %0"
> > > > +                           : "=r" (count_mask) : "g" (threads_l2));
> > > > +                      count_mask = ~(-1 << (count_mask + 1));
> > > > +                      threads_l2 = (shipped - 1) & count_mask;
> > > > +                      count &= ~0x1;
> > > > +                    }
> > > > +                }
> > > > +              else if (type == 0x200)
> > > > +                {
> > > > +                  /* Count core.  */
> > > > +                  if ((count & (0x1 << 1)))
> > > > +                    {
> > > > +                      int count_mask;
> > > > +                      int threads_core
> > > > +                        = (level == 2 ? threads_l2 : threads_l3);
> > > > +
> > > > +                      /* Compute count mask.  */
> > > > +                      asm ("bsr %1, %0"
> > > > +                           : "=r" (count_mask) : "g"
> (threads_core));
> > > > +                      count_mask = ~(-1 << (count_mask + 1));
> > > > +                      threads_core = (shipped - 1) & count_mask;
> > > > +                      if (level == 2)
> > > > +                        threads_l2 = threads_core;
> > > > +                      else
> > > > +                        threads_l3 = threads_core;
> > > > +                      count &= ~(0x1 << 1);
> > > > +                    }
> > > > +                }
> > > > +            }
> > > > +        }
> > > > +      if (level == 2 && threads_l2 > 0)
> > > > +        threads = threads_l2 + 1;
> > > > +      if (level == 3 && threads_l3 > 0)
> > > > +        threads = threads_l3 + 1;
> > > > +
> > > > +      if (shared > 0 && threads > 0)
> > > > +        shared /= threads;
> > > > +    }
> > >
> > > This code looks very similar to Intel code.   Can you factor it out and reuse
> > > it for you?
> >
> > I tried to extract this part of the code, but it didn’t look very clean.
> > For Example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.
>
> The same as new Intel processors.
>
> > Zhaoxin processors are currently inclusive caches, and the number of threads
>
> Do Zhaoxin processors use CPUID bit to indicate inclusive caches?

Yes, Zhaoxin processors use a CPUID bit to indicate it.

> > sharing L2 cache is not affected by the family or model.  Considering the
> possible
>
> You can check cpu_features->basic.kind for that.
>
> > changes of CPU design in future, it may be more convenient to keep separate
> branches.
> >
> > I was wandering if you had any concerns, or if you could give some
> suggestions,
> > that would be great.
>
> Let's avoid code duplication for now and revisit it in the future.

You’re right.  I will try as you suggest and send it as patch v3.

Thanks for your time.


Best Regards,
May Shao

Reply | Threaded
Open this post in threaded view
|

RE: [PATCH v2 2/3] x86: Add cache information support for Zhaoxin processors

Mayshao-oc
In reply to this post by Sourceware - libc-alpha mailing list

On Fri, Apr 10, 2020 at 7:53 PM H.J. Lu <[hidden email]> wrote:

>
> On Thu, Apr 9, 2020 at 7:34 PM Mayshao-oc <[hidden email]>
> wrote:
> >
> >
> > On Tue, April 7, 2020 at 8:44 PM H.J. Lu <[hidden email]> wrote:
> > >
> > > On Sun, Mar 29, 2020 at 10:35 PM MayShao <[hidden email]>
> wrote:
> > > >
> > > > To obtain Zhaoxin CPU cache information, add a new function
> > > > handle_zhaoxin().
> > > >
> > > > Add Zhaoxin branch in init_cacheinfo() for initializing variables,
> > > > such as __x86_shared_cache_size.
> > > >
> > > > ---
> > > >  sysdeps/x86/cacheinfo.c | 185
> > > ++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 185 insertions(+)
> > > >
> > > > diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
> > > > index e3e8ef2..e5a3284 100644
> > > > --- a/sysdeps/x86/cacheinfo.c
> > > > +++ b/sysdeps/x86/cacheinfo.c
> > > > @@ -436,6 +436,57 @@ handle_amd (int name)
> > > >  }
> > > >
> > > >
> > > > +static long int __attribute__ ((noinline))
> > > > +handle_zhaoxin (int name)
> > > > +{
> > > > +  unsigned int eax;
> > > > +  unsigned int ebx;
> > > > +  unsigned int ecx;
> > > > +  unsigned int edx;
> > > > +
> > > > +  int folded_rel_name = (M(name) / 3) * 3;
> > > > +
> > > > +  unsigned int round = 0;
> > > > +  while (1)
> > > > +    {
> > > > +      __cpuid_count (4, round, eax, ebx, ecx, edx);
> > > > +
> > > > +      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
> > > > +      if (type == null)
> > > > +        break;
> > > > +
> > > > +      unsigned int level = (eax >> 5) & 0x7;
> > > > +
> > > > +      if ((level == 1 && type == data
> > > > +        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
> > > > +        || (level == 1 && type == inst
> > > > +            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
> > > > +        || (level == 2 && folded_rel_name ==
> > > M(_SC_LEVEL2_CACHE_SIZE))
> > > > +        || (level == 3 && folded_rel_name ==
> > > M(_SC_LEVEL3_CACHE_SIZE)))
> > > > +        {
> > > > +          unsigned int offset = M(name) - folded_rel_name;
> > > > +
> > > > +          if (offset == 0)
> > > > +            /* Cache size.  */
> > > > +            return (((ebx >> 22) + 1)
> > > > +                * (((ebx >> 12) & 0x3ff) + 1)
> > > > +                * ((ebx & 0xfff) + 1)
> > > > +                * (ecx + 1));
> > > > +          if (offset == 1)
> > > > +            return (ebx >> 22) + 1;
> > > > +
> > > > +          assert (offset == 2);
> > > > +          return (ebx & 0xfff) + 1;
> > > > +        }
> > > > +
> > > > +      ++round;
> > > > +    }
> > > > +
> > > > +  /* Nothing found.  */
> > > > +  return 0;
> > > > +}
> > > > +
> > > > +
> > > >  /* Get the value of the system variable NAME.  */
> > > >  long int
> > > >  attribute_hidden
> > > > @@ -449,6 +500,9 @@ __cache_sysconf (int name)
> > > >    if (cpu_features->basic.kind == arch_kind_amd)
> > > >      return handle_amd (name);
> > > >
> > > > +  if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > > > +    return handle_zhaoxin (name);
> > > > +
> > > >    // XXX Fill in more vendors.
> > > >
> > > >    /* CPU not known, we have no information.  */
> > > > @@ -751,6 +805,137 @@ intel_bug_no_cache_info:
> > > >         }
> > > >  #endif
> > > >      }
> > > > +  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> > > > +    {
> > > > +      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> > > > +      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> > > > +      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> > > > +
> > > > +      /* Number of logical processors sharing L2 cache.  */
> > > > +      int threads_l2;
> > > > +
> > > > +      /* Number of logical processors sharing L3 cache.  */
> > > > +      int threads_l3;
> > > > +
> > > > +      if (shared <= 0)
> > > > +        {
> > > > +          /* No shared L3 cache.  All we have is the L2 cache.  */
> > > > +          level = 2;
> > > > +          shared = core;
> > > > +          threads_l2 = 0;
> > > > +          threads_l3 = -1;
> > > > +        }
> > > > +      else
> > > > +        {
> > > > +          level = 3;
> > > > +          threads_l2 = 0;
> > > > +          threads_l3 = 0;
> > > > +        }
> > > > +
> > > > +      int i = 0;
> > > > +
> > > > +      /* Query until cache level 2 and 3 are enumerated.  */
> > > > +      int check = 0x1 | (threads_l3 == 0) << 1;
> > > > +      do
> > > > +        {
> > > > +          __cpuid_count (4, i++, eax, ebx, ecx, edx);
> > > > +
> > > > +          switch ((eax >> 5) & 0x7)
> > > > +            {
> > > > +            default:
> > > > +              break;
> > > > +            case 2:
> > > > +              if ((check & 0x1))
> > > > +                {
> > > > +                  /* Get maximum number of logical processors
> > > > +                     sharing L2 cache.  */
> > > > +                  threads_l2 = (eax >> 14) & 0x3ff;
> > > > +                  check &= ~0x1;
> > > > +                }
> > > > +              break;
> > > > +            case 3:
> > > > +              if ((check & (0x1 << 1)))
> > > > +               {
> > > > +                  /* Get maximum number of logical processors
> > > > +                     sharing L3 cache.  */
> > > > +                  threads_l3 = (eax >> 14) & 0x3ff;
> > > > +                  check &= ~(0x1 << 1);
> > > > +                }
> > > > +              break;
> > > > +           }
> > > > +        }
> > > > +      while (check);
> > > > +
> > > > +      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the
> maximum
> > > > +         numbers of addressable IDs for logical processors sharing
> > > > +         the cache, instead of the maximum number of threads
> > > > +         sharing the cache.  */
> > > > +      if (max_cpuid >= 11)
> > > > +        {
> > > > +          /* Find the number of logical processors shipped in
> > > > +             one core and apply count mask.  */
> > > > +          i = 0;
> > > > +
> > > > +          /* Count SMT only if there is L3 cache.  Always count
> > > > +             core if there is no L3 cache.  */
> > > > +          int count = ((threads_l2 > 0 && level == 3)
> > > > +                       | ((threads_l3 > 0
> > > > +                           || (threads_l2 > 0 && level == 2)) <<
> 1));
> > > > +
> > > > +          while (count)
> > > > +            {
> > > > +              __cpuid_count (11, i++, eax, ebx, ecx, edx);
> > > > +
> > > > +              int shipped = ebx & 0xff;
> > > > +              int type = ecx & 0xff00;
> > > > +              if (shipped == 0 || type == 0)
> > > > +                break;
> > > > +              else if (type == 0x100)
> > > > +                {
> > > > +                  /* Count SMT.  */
> > > > +                  if ((count & 0x1))
> > > > +                    {
> > > > +                      int count_mask;
> > > > +
> > > > +                      /* Compute count mask.  */
> > > > +                      asm ("bsr %1, %0"
> > > > +                           : "=r" (count_mask) : "g" (threads_l2));
> > > > +                      count_mask = ~(-1 << (count_mask + 1));
> > > > +                      threads_l2 = (shipped - 1) & count_mask;
> > > > +                      count &= ~0x1;
> > > > +                    }
> > > > +                }
> > > > +              else if (type == 0x200)
> > > > +                {
> > > > +                  /* Count core.  */
> > > > +                  if ((count & (0x1 << 1)))
> > > > +                    {
> > > > +                      int count_mask;
> > > > +                      int threads_core
> > > > +                        = (level == 2 ? threads_l2 : threads_l3);
> > > > +
> > > > +                      /* Compute count mask.  */
> > > > +                      asm ("bsr %1, %0"
> > > > +                           : "=r" (count_mask) : "g"
> (threads_core));
> > > > +                      count_mask = ~(-1 << (count_mask + 1));
> > > > +                      threads_core = (shipped - 1) & count_mask;
> > > > +                      if (level == 2)
> > > > +                        threads_l2 = threads_core;
> > > > +                      else
> > > > +                        threads_l3 = threads_core;
> > > > +                      count &= ~(0x1 << 1);
> > > > +                    }
> > > > +                }
> > > > +            }
> > > > +        }
> > > > +      if (level == 2 && threads_l2 > 0)
> > > > +        threads = threads_l2 + 1;
> > > > +      if (level == 3 && threads_l3 > 0)
> > > > +        threads = threads_l3 + 1;
> > > > +
> > > > +      if (shared > 0 && threads > 0)
> > > > +        shared /= threads;
> > > > +    }
> > >
> > > This code looks very similar to Intel code.   Can you factor it out and reuse
> > > it for you?
> >
> > I tried to extract this part of the code, but it didn’t look very clean.
> > For Example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.
>
> The same as new Intel processors.
>
> > Zhaoxin processors currently use inclusive caches, and the number of threads
>
> Do Zhaoxin processors use CPUID bit to indicate inclusive caches?

Yes, Zhaoxin processors use a CPUID bit to indicate it.

> > sharing L2 cache is not affected by the family or model.  Considering the
> possible
>
> You can check cpu_features->basic.kind for that.
>
> > changes of CPU design in future, it may be more convenient to keep separate
> branches.
> >
> > I was wondering if you had any concerns, or if you could give some
> suggestions,
> > that would be great.
>
> Let's avoid code duplication for now and revisit it in the future.
>
You are right.  I will try as you suggest and send it as patch v3.

Thank you for your comment.

Best Regards,
May Shao