[PATCH] PPC64 performance improvements for sqrt

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] PPC64 performance improvements for sqrt

Steven Munroe
For POWER4 and later, the performance of sqrt()/sqrtf() is limited by
the overhead of stacking a frame, addressing the GOT, testing for NaN
etc, which is only needed in case we need to report EDOM via
__kernel_standard(). For PowerPC the fsqrt[s] instruction does all the
work except for setting errno to EDOM for non-zero negative values.

So the attached patch simulates prologue shrink-wrapping such that the
fsqrt calculation and basic error checking occur before the prologue.
If there are no errors the result is returned directly. Otherwise it
stacks a frame and reports errors if needed via __kernel_standard().

This patch only impacts builds using:

    --with-cpu=[power4,970,power5,power5+,power6,power6x]

the code for power4/970 is slightly different from that for power5 and
later due to increased latency accessing the FPSCR. For micro benchmarks
I see the following improvement:

    1.74 X on ppc970 2.0GHz
    1.71 X on power4 1.0GHz
    2.60 X on power5 1.9GHz
    1.55 X on power6 4.2GHz

This patch will also be released with powerpc-cpu-V0.07.



2007-10-26  Steven Munroe  <[hidden email]>

        * sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c (__sqrt): Make __sqrt
        a leaf routine with tail call to __w_sqrt. Error path code moved to.
        (__w_sqrt): Here
        * sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c (__sqrtf): Make
        __sqrtf a leaf routine with tail call to __w_sqrtf. Error path code
        moved to.
        (__w_sqrtf): Here
        * sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c: New File.
        * sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c: New File.
        * sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c (__sqrt): Make __sqrt
        a leaf routine with tail call to __w_sqrt. Error path code moved to.
        (__w_sqrt): Here
        * sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c (__sqrtf): Make
        __sqrtf a leaf routine with tail call to __w_sqrtf. Error path code
        moved to.
        (__w_sqrtf): Here
        * sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c: New File.
        * sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c: New File.

diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c 2007-06-03 15:51:32.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.c 2007-10-26 11:14:06.881065568 -0500
@@ -32,11 +32,47 @@
 #endif
 {
   double z;
-/* Power4 (ISA V2.0) and above implement sqrt in hardware.  */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrt instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.  Extended checking is
+   only needed to set errno (via __kernel_standard) if the input value
+   is negative.
+  
+   The fsqrt will set FPCC and FU (Floating Point Unordered or NaN)
+   to indicate that the input value was negative or NaN. Use Move to
+   Condition Register from FPSCR to copy the FPCC field to cr1.  The
+   branch on summary overflow transfers control to __w_sqrt to process
+   any error conditions. Otherwise we can return the result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrt as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
    __asm __volatile (
- " fsqrt %0,%1\n"
+ " fsqrt 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
  : "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+  return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
 #ifdef _IEEE_LIBM
   return z;
 #else
@@ -45,7 +81,7 @@
     
   if (__builtin_expect (x != x, 0))
     return z;
-
+    
   if  (__builtin_expect (x < 0.0, 0))
     return __kernel_standard (x, x, 26); /* sqrt(negative) */
   else
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c 2007-06-03 15:51:38.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.c 2007-10-26 11:14:06.886064808 -0500
@@ -33,16 +33,51 @@
      float x;
 #endif
 {
-#ifdef _IEEE_LIBM
-  return __ieee754_sqrtf (x);
-#else
   float z;
-/* Power4 (ISA V2.0) and above implement sqrtf in hardware.  */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrts instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.  Extended checking is
+   only needed to set errno (via __kernel_standard) if the input value
+   is negative.
+  
+   The fsqrts will set FPCC and FU (Floating Point Unordered or NaN)
+   to indicate that the input value was negative or NaN. Use Move to
+   Condition Register from FPSCR to copy the FPCC field to cr1.  The
+   branch on summary overflow transfers control to __w_sqrtf to process
+   any error conditions. Otherwise we can return the result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrtf as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
    __asm __volatile (
- " fsqrts %0,%1\n"
+ " fsqrts 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
  : "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+  return z;
+}
 
+
+/* This code gets control from the __sqrtf wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+  return z;  
+#else
   if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
     return z;
     
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.c Fri Oct 26 11:17:20 2007
@@ -0,0 +1,98 @@
+/* Double-precision floating point square root wrapper.
+   Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <math_ldbl_opt.h>
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#ifdef __STDC__
+double
+__sqrt (double x) /* wrapper sqrt */
+#else
+double
+__sqrt (x) /* wrapper sqrt */
+     double x;
+#endif
+{
+  double z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrt instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.
+  
+   Extended checking is only needed to set errno (via __kernel_standard)
+   if the input value is negative. So compare the input value against
+   the absolute value of itself. This will compare equal unless the
+   value is negative (EDOM) or a NAN, in which case we transfer (tail
+   call via branch) to the extended wrapper. If equal we can return the
+   result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrt as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
+   __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrt 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+  return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  X is the original input; Z is the fsqrt result.  */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+  return z;
+#else
+  if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+    return z;
+    
+  if (__builtin_expect (x != x, 0)) /* true only when x is a NaN */
+    return z;
+    
+  if  (__builtin_expect (x < 0.0, 0))
+    return __kernel_standard (x, x, 26); /* sqrt(negative) */
+  else
+    return z;
+#endif
+}
+
+weak_alias (__sqrt, sqrt)
+#ifdef NO_LONG_DOUBLE
+  strong_alias (__sqrt, __sqrtl) weak_alias (__sqrt, sqrtl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_0)
+compat_symbol (libm, __sqrt, sqrtl, GLIBC_2_0);
+#endif
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.c Fri Oct 26 11:14:14 2007
@@ -0,0 +1,95 @@
+/* Single-precision floating point square root wrapper.
+   Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+
+#ifdef __STDC__
+float
+__sqrtf (float x) /* wrapper sqrtf */
+#else
+float
+__sqrtf (x) /* wrapper sqrtf */
+     float x;
+#endif
+{
+  float z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrts instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.
+  
+   Extended checking is only needed to set errno (via __kernel_standard)
+   if the input value is negative. So compare the input value against
+   the absolute value of itself. This will compare equal unless the
+   value is negative (EDOM) or a NAN, in which case we transfer (tail
+   call via branch) to the extended wrapper. If equal we can return the
+   result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrtf as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
+   __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrts 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+  return z;
+}
+
+
+/* This code gets control from the __sqrtf wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  X is the original input; Z is the fsqrts result.  */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+  return z;  
+#else
+  if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+    return z;
+    
+  if (__builtin_expect (x != x, 0)) /* true only when x is a NaN */
+    return z;
+    
+  if  (__builtin_expect (x < 0.0, 0))
+    /* sqrtf(negative) */
+    return (float) __kernel_standard ((double) x, (double) x, 126);
+  else
+    return z;
+#endif
+}
+
+weak_alias (__sqrtf, sqrtf)
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c 2007-06-03 16:15:53.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrt.c 2007-10-24 17:13:16.336139032 -0500
@@ -32,11 +32,47 @@
 #endif
 {
   double z;
-/* Power4 (ISA V2.0) and above implement sqrt in hardware.  */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrt instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.  Extended checking is
+   only needed to set errno (via __kernel_standard) if the input value
+   is negative.
+  
+   The fsqrt will set FPCC and FU (Floating Point Unordered or NaN)
+   to indicate that the input value was negative or NaN. Use Move to
+   Condition Register from FPSCR to copy the FPCC field to cr1.  The
+   branch on summary overflow transfers control to __w_sqrt to process
+   any error conditions. Otherwise we can return the result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrt as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
    __asm __volatile (
- " fsqrt %0,%1\n"
+ " fsqrt 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
  : "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+  return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
 #ifdef _IEEE_LIBM
   return z;
 #else
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c 2007-06-03 16:15:58.000000000 -0500
+++ libc25/sysdeps/powerpc/powerpc64/power4/fpu/w_sqrtf.c 2007-10-24 16:07:55.808110280 -0500
@@ -33,16 +33,51 @@
      float x;
 #endif
 {
-#ifdef _IEEE_LIBM
-  return __ieee754_sqrtf (x);
-#else
   float z;
-/* Power4 (ISA V2.0) and above implement sqrtf in hardware.  */
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrts instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.  Extended checking is
+   only needed to set errno (via __kernel_standard) if the input value
+   is negative.
+  
+   The fsqrts will set FPCC and FU (Floating Point Unordered or NaN)
+   to indicate that the input value was negative or NaN. Use Move to
+   Condition Register from FPSCR to copy the FPCC field to cr1.  The
+   branch on summary overflow transfers control to __w_sqrtf to process
+   any error conditions. Otherwise we can return the result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrtf as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
    __asm __volatile (
- " fsqrts %0,%1\n"
+ " fsqrts 2,%1\n"
+ " mcrfs cr1,4\n"
+ " bso- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
  : "=f" (z)
- : "f" (x));
+ : "f" (x)
+ : "cr1", "fr2");
+
+  return z;
+}
 
+
+/* This code gets control from the __sqrtf wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+  return z;  
+#else
   if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
     return z;
     
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrt.c Fri Oct 26 11:12:12 2007
@@ -0,0 +1,98 @@
+/* Double-precision floating point square root wrapper.
+   Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <math_ldbl_opt.h>
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#ifdef __STDC__
+double
+__sqrt (double x) /* wrapper sqrt */
+#else
+double
+__sqrt (x) /* wrapper sqrt */
+     double x;
+#endif
+{
+  double z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrt instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.
+  
+   Extended checking is only needed to set errno (via __kernel_standard)
+   if the input value is negative. So compare the input value against
+   the absolute value of itself. This will compare equal unless the
+   value is negative (EDOM) or a NAN, in which case we transfer (tail
+   call via branch) to the extended wrapper. If equal we can return the
+   result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrt as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
+   __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrt 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrt\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+  return z;
+}
+
+
+/* This code gets control from the __sqrt wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  X is the original input; Z is the fsqrt result.  */
+double
+__w_sqrt (double x, double z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+  return z;
+#else
+  if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+    return z;
+    
+  if (__builtin_expect (x != x, 0)) /* true only when x is a NaN */
+    return z;
+    
+  if  (__builtin_expect (x < 0.0, 0))
+    return __kernel_standard (x, x, 26); /* sqrt(negative) */
+  else
+    return z;
+#endif
+}
+
+weak_alias (__sqrt, sqrt)
+#ifdef NO_LONG_DOUBLE
+  strong_alias (__sqrt, __sqrtl) weak_alias (__sqrt, sqrtl)
+#endif
+#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_3)
+compat_symbol (libm, __sqrt, sqrtl, GLIBC_2_3);
+#endif
diff -urN libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c
--- libc25-cvstip-20070919/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c Wed Dec 31 18:00:00 1969
+++ libc25/sysdeps/powerpc/powerpc64/power5/fpu/w_sqrtf.c Wed Oct 24 10:07:59 2007
@@ -0,0 +1,95 @@
+/* Single-precision floating point square root wrapper.
+   Copyright (C) 2004, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include "math.h"
+#include "math_private.h"
+#include <fenv_libc.h>
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+
+#ifdef __STDC__
+float
+__sqrtf (float x) /* wrapper sqrtf */
+#else
+float
+__sqrtf (x) /* wrapper sqrtf */
+     float x;
+#endif
+{
+  float z;
+/* Power4 (ISA V2.0) and above implement sqrt in hardware (not optional).
+   The fsqrts instruction generates the correct value for all inputs and
+   sets the appropriate floating point exceptions.
+  
+   Extended checking is only needed to set errno (via __kernel_standard)
+   if the input value is negative. So compare the input value against
+   the absolute value of itself. This will compare equal unless the
+   value is negative (EDOM) or a NAN, in which case we transfer (tail
+   call via branch) to the extended wrapper. If equal we can return the
+   result directly.
+  
+   This function looks like a leaf routine,  so gcc will not stack a
+   frame or generate prologue/epilogue code. This means it is safe to
+   transfer directly to __w_sqrtf as long as the input value (f1) is
+   preserved. Putting the sqrt result into f2 (float parameter 2)
+   allows passing both the input value and sqrt result into the extended
+   wrapper so there is no need to recompute.
+  
+   This tactic avoids the overhead of stacking a frame for the normal
+   (non-error) case.  Until gcc supports prologue shrink-wrapping
+   this is the best we can do.  */
+   __asm __volatile (
+ " fabs 0,%1\n"
+ " fsqrts 2,%1\n"
+ " fcmpu cr1,0,%1\n"
+ " bne- cr1,__w_sqrtf\n"
+ " fmr %0,2\n"
+ : "=f" (z)
+ : "f" (x)
+ : "cr1", "fr0", "fr2");
+
+  return z;
+}
+
+
+/* This code gets control from the __sqrtf wrapper only if there are
+   errors that need to be reported. For example negative input values
+   or NANs.  X is the original input; Z is the fsqrts result.  */
+float
+__w_sqrtf (float x, float z) /* wrapper sqrt errors */
+{
+#ifdef _IEEE_LIBM
+  return z;  
+#else
+  if (__builtin_expect (_LIB_VERSION == _IEEE_, 0))
+    return z;
+    
+  if (__builtin_expect (x != x, 0)) /* true only when x is a NaN */
+    return z;
+    
+  if  (__builtin_expect (x < 0.0, 0))
+    /* sqrtf(negative) */
+    return (float) __kernel_standard ((double) x, (double) x, 126);
+  else
+    return z;
+#endif
+}
+
+weak_alias (__sqrtf, sqrtf)