[PATCH 00/16] Go closures for aarch64

classic Classic list List threaded Threaded
20 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH 00/16] Go closures for aarch64

Richard Henderson
This patch set fixes a compilation error introduced by the iOS merge,
tidies up the port significantly, and finally adds support for
complex types and Go closures.


r~


Richard Henderson (16):
  aarch64: Fix non-apple compilation
  aarch64: Improve is_hfa
  aarch64: Always distinguish LONGDOUBLE
  aarch64: Simplify AARCH64_STACK_ALIGN
  aarch64: Reduce the size of register_context
  aarch64: Use correct return registers
  aarch64: Treat void return as not passed in registers
  aarch64: Tidy up abi manipulation
  aarch64: Merge prep_args with ffi_call
  aarch64: Move return value handling into ffi_call_SYSV
  aarch64: Move return value handling into ffi_closure_SYSV
  aarch64: Unify scalar fp and hfa handling
  aarch64: Remove aarch64_flags
  aarch64: Add support for complex types
  aarch64: Move x8 out of call_context
  aarch64: Add support for Go closures

 src/aarch64/ffi.c              | 1477 ++++++++++++++++------------------------
 src/aarch64/ffitarget.h        |   14 +-
 src/aarch64/internal.h         |   67 ++
 src/aarch64/sysv.S             |  589 +++++++++-------
 testsuite/libffi.call/call.exp |   10 +-
 5 files changed, 1008 insertions(+), 1149 deletions(-)
 create mode 100644 src/aarch64/internal.h

--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 02/16] aarch64: Improve is_hfa

Richard Henderson
From: Richard Henderson <[hidden email]>

The functions get_homogeneous_type, element_count, and is_hfa are all
intertwined and recompute the same data.  Return a compound quantity
from is_hfa that contains all the data and avoids the recomputation.
---
 src/aarch64/ffi.c | 212 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 131 insertions(+), 81 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index cdb7816..0834614 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -242,88 +242,132 @@ is_floating_type (unsigned short type)
   || type == FFI_TYPE_LONGDOUBLE);
 }
 
-/* Test for a homogeneous structure.  */
+/* A subroutine of is_hfa.  Given a structure type, return the type code
+   of the first non-structure element.  Recurse for structure elements.
+   Return -1 if the structure is in fact empty, i.e. no nested elements.  */
 
-static unsigned short
-get_homogeneous_type (ffi_type *ty)
+static int
+is_hfa0 (const ffi_type *ty)
 {
-  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
-    {
-      unsigned i;
-      unsigned short candidate_type
- = get_homogeneous_type (ty->elements[0]);
-      for (i =1; ty->elements[i]; i++)
- {
-  unsigned short iteration_type = 0;
-  /* If we have a nested struct, we must find its homogeneous type.
-     If that fits with our candidate type, we are still
-     homogeneous.  */
-  if (ty->elements[i]->type == FFI_TYPE_STRUCT
-      && ty->elements[i]->elements)
-    {
-      iteration_type = get_homogeneous_type (ty->elements[i]);
-    }
-  else
-    {
-      iteration_type = ty->elements[i]->type;
-    }
+  ffi_type **elements = ty->elements;
+  int i, ret = -1;
 
-  /* If we are not homogeneous, return FFI_TYPE_STRUCT.  */
-  if (candidate_type != iteration_type)
-    return FFI_TYPE_STRUCT;
- }
-      return candidate_type;
-    }
+  if (elements != NULL)
+    for (i = 0; elements[i]; ++i)
+      {
+        ret = elements[i]->type;
+        if (ret == FFI_TYPE_STRUCT)
+          {
+            ret = is_hfa0 (elements[i]);
+            if (ret < 0)
+              continue;
+          }
+        break;
+      }
 
-  /* Base case, we have no more levels of nesting, so we
-     are a basic type, and so, trivially homogeneous in that type.  */
-  return ty->type;
+  return ret;
 }
 
-/* Determine the number of elements within a STRUCT.
+/* A subroutine of is_hfa.  Given a structure type, return true if all
+   of the non-structure elements are the same as CANDIDATE.  */
 
-   Note, we must handle nested structs.
+static int
+is_hfa1 (const ffi_type *ty, int candidate)
+{
+  ffi_type **elements = ty->elements;
+  int i;
 
-   If ty is not a STRUCT this function will return 0.  */
+  if (elements != NULL)
+    for (i = 0; elements[i]; ++i)
+      {
+        int t = elements[i]->type;
+        if (t == FFI_TYPE_STRUCT)
+          {
+            if (!is_hfa1 (elements[i], candidate))
+              return 0;
+          }
+        else if (t != candidate)
+          return 0;
+      }
 
-static unsigned
-element_count (ffi_type *ty)
-{
-  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
-    {
-      unsigned n;
-      unsigned elems = 0;
-      for (n = 0; ty->elements[n]; n++)
- {
-  if (ty->elements[n]->type == FFI_TYPE_STRUCT
-      && ty->elements[n]->elements)
-    elems += element_count (ty->elements[n]);
-  else
-    elems++;
- }
-      return elems;
-    }
-  return 0;
+  return 1;
 }
 
-/* Test for a homogeneous floating point aggregate.
+/* Determine if TY is an homogenous floating point aggregate (HFA).
+   That is, a structure consisting of 1 to 4 members of all the same type,
+   where that type is a floating point scalar.
 
-   A homogeneous floating point aggregate is a homogeneous aggregate of
-   a half- single- or double- precision floating point type with one
-   to four elements.  Note that this includes nested structs of the
-   basic type.  */
+   Returns non-zero iff TY is an HFA.  The result is an encoded value where
+   bits 0-7 contain the type code, and bits 8-10 contain the element count.  */
 
 static int
-is_hfa (ffi_type *ty)
+is_hfa(const ffi_type *ty)
 {
-  if (ty->type == FFI_TYPE_STRUCT
-      && ty->elements[0]
-      && is_floating_type (get_homogeneous_type (ty)))
+  ffi_type **elements;
+  int candidate, i;
+  size_t size, ele_count;
+
+  /* Quickest tests first.  */
+  if (ty->type != FFI_TYPE_STRUCT)
+    return 0;
+
+  /* No HFA types are smaller than 4 bytes, or larger than 64 bytes.  */
+  size = ty->size;
+  if (size < 4 || size > 64)
+    return 0;
+
+  /* Find the type of the first non-structure member.  */
+  elements = ty->elements;
+  candidate = elements[0]->type;
+  if (candidate == FFI_TYPE_STRUCT)
     {
-      unsigned n = element_count (ty);
-      return n >= 1 && n <= 4;
+      for (i = 0; ; ++i)
+        {
+          candidate = is_hfa0 (elements[i]);
+          if (candidate >= 0)
+            break;
+        }
     }
-  return 0;
+
+  /* If the first member is not a floating point type, it's not an HFA.
+     Also quickly re-check the size of the structure.  */
+  switch (candidate)
+    {
+    case FFI_TYPE_FLOAT:
+      ele_count = size / sizeof(float);
+      if (size != ele_count * sizeof(float))
+        return 0;
+      break;
+    case FFI_TYPE_DOUBLE:
+      ele_count = size / sizeof(double);
+      if (size != ele_count * sizeof(double))
+        return 0;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      ele_count = size / sizeof(long double);
+      if (size != ele_count * sizeof(long double))
+        return 0;
+      break;
+    default:
+      return 0;
+    }
+  if (ele_count > 4)
+    return 0;
+
+  /* Finally, make sure that all scalar elements are the same type.  */
+  for (i = 0; elements[i]; ++i)
+    {
+      if (elements[i]->type == FFI_TYPE_STRUCT)
+        {
+          if (!is_hfa1 (elements[i], candidate))
+            return 0;
+        }
+      else if (elements[i]->type != candidate)
+        return 0;
+    }
+
+  /* All tests succeeded.  Encode the result.  */
+  return (ele_count << 8) | candidate;
 }
 
 /* Test if an ffi_type is a candidate for passing in a register.
@@ -559,7 +603,10 @@ copy_hfa_to_reg_or_stack (void *memory,
   unsigned char *stack,
   struct arg_state *state)
 {
-  unsigned elems = element_count (ty);
+  int h = is_hfa (ty);
+  int type = h & 0xff;
+  unsigned elems = h >> 8;
+
   if (available_v (state) < elems)
     {
       /* There are insufficient V registers. Further V register allocations
@@ -573,7 +620,6 @@ copy_hfa_to_reg_or_stack (void *memory,
   else
     {
       int i;
-      unsigned short type = get_homogeneous_type (ty);
       for (i = 0; i < elems; i++)
  {
   void *reg = allocate_to_v (context, state);
@@ -813,6 +859,7 @@ void
 ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
   extended_cif ecif;
+  int h;
 
   ecif.cif = cif;
   ecif.avalue = avalue;
@@ -861,11 +908,12 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
  }
 
               case FFI_TYPE_STRUCT:
-                if (is_hfa (cif->rtype))
+ h = is_hfa (cif->rtype);
+                if (h)
   {
     int j;
-    unsigned short type = get_homogeneous_type (cif->rtype);
-    unsigned elems = element_count (cif->rtype);
+    int type = h & 0xff;
+    int elems = h >> 8;
     for (j = 0; j < elems; j++)
       {
  void *reg = get_basic_type_addr (type, &context, j);
@@ -967,7 +1015,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
   ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
   void *rvalue = NULL;
-  int i;
+  int i, h;
   struct arg_state state;
 
   arg_init (&state, ALIGN(cif->bytes, 16));
@@ -1002,9 +1050,10 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 #endif
 
  case FFI_TYPE_STRUCT:
-  if (is_hfa (ty))
+  h = is_hfa (ty);
+  if (h)
     {
-      unsigned n = element_count (ty);
+      unsigned n = h >> 8;
       if (available_v (&state) < n)
  {
   state.nsrn = N_V_ARG_REG;
@@ -1013,7 +1062,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
  }
       else
  {
-  switch (get_homogeneous_type (ty))
+  switch (h & 0xff)
     {
     case FFI_TYPE_FLOAT:
       {
@@ -1027,9 +1076,9 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
    correctly. The fake can be tossed once the
    closure function has returned hence alloca()
    is sufficient. */
- int j;
+ unsigned j;
  UINT32 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < element_count (ty); j++)
+ for (j = 0; j < n; j++)
   memcpy (&p[j],
   allocate_to_s (context, &state),
   sizeof (*p));
@@ -1048,9 +1097,9 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
    correctly. The fake can be tossed once the
    closure function has returned hence alloca()
    is sufficient. */
- int j;
+ unsigned j;
  UINT64 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < element_count (ty); j++)
+ for (j = 0; j < n; j++)
   memcpy (&p[j],
   allocate_to_d (context, &state),
   sizeof (*p));
@@ -1143,11 +1192,12 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
             break;
   }
         case FFI_TYPE_STRUCT:
-          if (is_hfa (cif->rtype))
+  h = is_hfa (cif->rtype);
+          if (h)
     {
       int j;
-      unsigned short type = get_homogeneous_type (cif->rtype);
-      unsigned elems = element_count (cif->rtype);
+      int type = h & 0xff;
+      int elems = h >> 8;
       for (j = 0; j < elems; j++)
  {
   void *reg = get_basic_type_addr (type, context, j);
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 01/16] aarch64: Fix non-apple compilation

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

---
 src/aarch64/ffi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 5369ea4..cdb7816 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -782,7 +782,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
           }
     }
 
+#if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
+#endif
 
   return FFI_OK;
 }
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 03/16] aarch64: Always distinguish LONGDOUBLE

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

Avoid if-deffery by forcing FFI_TYPE_LONGDOUBLE to be different
from FFI_TYPE_DOUBLE.  The FFI_TYPE_LONGDOUBLE code paths will simply
be unused on hosts that define the two identically.
---
 src/aarch64/ffi.c | 41 ++++++++++++++---------------------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 0834614..f065be5 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -20,11 +20,20 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
-
+#include <stdlib.h>
 #include <ffi.h>
 #include <ffi_common.h>
 
-#include <stdlib.h>
+/* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
+   all further uses in this file will refer to the 128-bit type.  */
+#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+# if FFI_TYPE_LONGDOUBLE != 4
+#  error FFI_TYPE_LONGDOUBLE out of date
+# endif
+#else
+# undef FFI_TYPE_LONGDOUBLE
+# define FFI_TYPE_LONGDOUBLE 4
+#endif
 
 /* Stack alignment requirement in bytes */
 #if defined (__APPLE__)
@@ -115,10 +124,8 @@ get_basic_type_addr (unsigned short type, struct call_context *context,
       return get_s_addr (context, n);
     case FFI_TYPE_DOUBLE:
       return get_d_addr (context, n);
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       return get_v_addr (context, n);
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_SINT8:
     case FFI_TYPE_UINT16:
@@ -151,10 +158,8 @@ get_basic_type_alignment (unsigned short type)
 #endif
     case FFI_TYPE_DOUBLE:
       return sizeof (UINT64);
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       return sizeof (long double);
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_SINT8:
 #if defined (__APPLE__)
@@ -193,10 +198,8 @@ get_basic_type_size (unsigned short type)
       return sizeof (UINT32);
     case FFI_TYPE_DOUBLE:
       return sizeof (UINT64);
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       return sizeof (long double);
-#endif
     case FFI_TYPE_UINT8:
       return sizeof (UINT8);
     case FFI_TYPE_SINT8:
@@ -390,9 +393,7 @@ is_register_candidate (ffi_type *ty)
     case FFI_TYPE_VOID:
     case FFI_TYPE_FLOAT:
     case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_UINT16:
     case FFI_TYPE_UINT32:
@@ -557,11 +558,9 @@ copy_basic_type (void *dest, void *source, unsigned short type)
     case FFI_TYPE_DOUBLE:
       *(double *) dest = *(double *) source;
       break;
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       *(long double *) dest = *(long double *) source;
       break;
-#endif
     case FFI_TYPE_UINT8:
       *(ffi_arg *) dest = *(UINT8 *) source;
       break;
@@ -653,13 +652,11 @@ allocate_to_register_or_stack (struct call_context *context,
  return allocate_to_d (context, state);
       state->nsrn = N_V_ARG_REG;
       break;
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       if (state->nsrn < N_V_ARG_REG)
  return allocate_to_v (context, state);
       state->nsrn = N_V_ARG_REG;
       break;
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_SINT8:
     case FFI_TYPE_UINT16:
@@ -722,9 +719,7 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
    appropriate register, or if none are available, to the stack.  */
  case FFI_TYPE_FLOAT:
  case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
  case FFI_TYPE_LONGDOUBLE:
-#endif
  case FFI_TYPE_UINT8:
  case FFI_TYPE_SINT8:
  case FFI_TYPE_UINT16:
@@ -887,9 +882,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
               case FFI_TYPE_VOID:
               case FFI_TYPE_FLOAT:
               case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
               case FFI_TYPE_LONGDOUBLE:
-#endif
               case FFI_TYPE_UINT8:
               case FFI_TYPE_SINT8:
               case FFI_TYPE_UINT16:
@@ -1040,14 +1033,12 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
  case FFI_TYPE_POINTER:
  case FFI_TYPE_UINT64:
  case FFI_TYPE_SINT64:
- case  FFI_TYPE_FLOAT:
- case  FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
- case  FFI_TYPE_LONGDOUBLE:
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
   avalue[i] = allocate_to_register_or_stack (context, stack,
      &state, ty->type);
   break;
-#endif
 
  case FFI_TYPE_STRUCT:
   h = is_hfa (ty);
@@ -1106,13 +1097,11 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
  break;
       }
 
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
   memcpy (&avalue[i],
   allocate_to_v (context, &state),
   sizeof (*avalue));
       break;
-#endif
 
     default:
       FFI_ASSERT (0);
@@ -1183,9 +1172,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
         case FFI_TYPE_SINT64:
         case FFI_TYPE_FLOAT:
         case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
         case FFI_TYPE_LONGDOUBLE:
-#endif
   {
     void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
     copy_basic_type (addr, rvalue, cif->rtype->type);
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 04/16] aarch64: Simplify AARCH64_STACK_ALIGN

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

The iOS ABI doesn't require padding between arguments, but
that's not what AARCH64_STACK_ALIGN meant.  The hardware will
in fact trap if the SP register is not 16-byte aligned.
---
 src/aarch64/ffi.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index f065be5..a6fcc11 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -35,13 +35,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 # define FFI_TYPE_LONGDOUBLE 4
 #endif
 
-/* Stack alignment requirement in bytes */
-#if defined (__APPLE__)
-#define AARCH64_STACK_ALIGN 1
-#else
-#define AARCH64_STACK_ALIGN 16
-#endif
-
 #define N_X_ARG_REG 8
 #define N_V_ARG_REG 8
 
@@ -799,8 +792,7 @@ ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
   /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes =
-    (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1);
+  cif->bytes = ALIGN(cif->bytes, 16);
 
   /* Initialize our flags. We are interested if this CIF will touch a
      vector register, if so we will enable context save and load to
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 05/16] aarch64: Reduce the size of register_context

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

We don't need to store 32 general and vector registers.
Only 8 of each are used for parameter passing.
---
 src/aarch64/ffi.c       |  35 ++++++++---------
 src/aarch64/ffitarget.h |   6 ---
 src/aarch64/internal.h  |  26 +++++++++++++
 src/aarch64/sysv.S      | 100 +++++++++++++++++++++++-------------------------
 4 files changed, 91 insertions(+), 76 deletions(-)
 create mode 100644 src/aarch64/internal.h

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index a6fcc11..58d088b 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -21,8 +21,10 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
 
 /* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
    all further uses in this file will refer to the 128-bit type.  */
@@ -35,38 +37,35 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 # define FFI_TYPE_LONGDOUBLE 4
 #endif
 
-#define N_X_ARG_REG 8
-#define N_V_ARG_REG 8
-
-#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT)
-
 union _d
 {
   UINT64 d;
   UINT32 s[2];
 };
 
+struct _v
+{
+  union _d d[2] __attribute__((aligned(16)));
+};
+
 struct call_context
 {
-  UINT64 x [AARCH64_N_XREG];
-  struct
-  {
-    union _d d[2];
-  } v [AARCH64_N_VREG];
+  struct _v v[N_V_ARG_REG];
+  UINT64 x[N_X_ARG_REG];
+  UINT64 x8;
 };
 
 #if defined (__clang__) && defined (__APPLE__)
-extern void
-sys_icache_invalidate (void *start, size_t len);
+extern void sys_icache_invalidate (void *start, size_t len);
 #endif
 
 static inline void
 ffi_clear_cache (void *start, void *end)
 {
 #if defined (__clang__) && defined (__APPLE__)
- sys_icache_invalidate (start, (char *)end - (char *)start);
+  sys_icache_invalidate (start, (char *)end - (char *)start);
 #elif defined (__GNUC__)
- __builtin___clear_cache (start, end);
+  __builtin___clear_cache (start, end);
 #else
 #error "Missing builtin to flush instruction cache"
 #endif
@@ -802,7 +801,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 
   if (is_v_register_candidate (cif->rtype))
     {
-      cif->aarch64_flags |= AARCH64_FFI_WITH_V;
+      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
     }
   else
     {
@@ -810,7 +809,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       for (i = 0; i < cif->nargs; i++)
         if (is_v_register_candidate (cif->arg_types[i]))
           {
-            cif->aarch64_flags |= AARCH64_FFI_WITH_V;
+            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
             break;
           }
     }
@@ -924,7 +923,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
           }
         else
           {
-            memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64));
+    context.x8 = (uintptr_t)rvalue;
             ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
    stack_bytes, fn);
           }
@@ -1201,7 +1200,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
     }
   else
     {
-      memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64));
+      rvalue = (void *)(uintptr_t)context->x8;
       (closure->fun) (cif, rvalue, avalue, closure->user_data);
     }
 }
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 4bbced2..336f28a 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -54,10 +54,4 @@ typedef enum ffi_abi
 #define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags
 #endif
 
-#define AARCH64_FFI_WITH_V_BIT 0
-
-#define AARCH64_N_XREG 32
-#define AARCH64_N_VREG 32
-#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16)
-
 #endif
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
new file mode 100644
index 0000000..b6b6104
--- /dev/null
+++ b/src/aarch64/internal.h
@@ -0,0 +1,26 @@
+/*
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#define AARCH64_FLAG_ARG_V_BIT 0
+#define AARCH64_FLAG_ARG_V (1 << AARCH64_FLAG_ARG_V_BIT)
+
+#define N_X_ARG_REG 8
+#define N_V_ARG_REG 8
+#define CALL_CONTEXT_SIZE (N_V_ARG_REG * 16 + N_X_ARG_REG * 8 + 16)
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index 169eab8..70870db 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -22,6 +22,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal.h"
 
 #ifdef HAVE_MACHINE_ASM_H
 #include <machine/asm.h>
@@ -43,13 +44,12 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
 
         .text
+        .align 2
+
         .globl CNAME(ffi_call_SYSV)
 #ifdef __ELF__
         .type CNAME(ffi_call_SYSV), #function
 #endif
-#ifdef __APPLE__
-        .align 2
-#endif
 
 /* ffi_call_SYSV()
 
@@ -142,42 +142,40 @@ CNAME(ffi_call_SYSV):
         mov     x23, x0
 
         /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Load the vector argument passing registers.  */
-        ldp     q0, q1, [x21, #8*32 +  0]
-        ldp     q2, q3, [x21, #8*32 + 32]
-        ldp     q4, q5, [x21, #8*32 + 64]
-        ldp     q6, q7, [x21, #8*32 + 96]
+        ldp     q0, q1, [x21, #0]
+        ldp     q2, q3, [x21, #32]
+        ldp     q4, q5, [x21, #64]
+        ldp     q6, q7, [x21, #96]
 1:
-        /* Load the core argument passing registers.  */
-        ldp     x0, x1, [x21,  #0]
-        ldp     x2, x3, [x21, #16]
-        ldp     x4, x5, [x21, #32]
-        ldp     x6, x7, [x21, #48]
-
-        /* Don't forget x8 which may be holding the address of a return buffer.
- */
-        ldr     x8,     [x21, #8*8]
+        /* Load the core argument passing registers, including
+   the structure return pointer.  */
+        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
+        ldr     x8,     [x21, #16*N_V_ARG_REG + 64]
 
         blr     x24
 
         /* Save the core argument passing registers.  */
-        stp     x0, x1, [x21,  #0]
-        stp     x2, x3, [x21, #16]
-        stp     x4, x5, [x21, #32]
-        stp     x6, x7, [x21, #48]
+        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
 
         /* Note nothing useful ever comes back in x8!  */
 
         /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Save the vector argument passing registers.  */
-        stp     q0, q1, [x21, #8*32 + 0]
-        stp     q2, q3, [x21, #8*32 + 32]
-        stp     q4, q5, [x21, #8*32 + 64]
-        stp     q6, q7, [x21, #8*32 + 96]
+        stp     q0, q1, [x21, #0]
+        stp     q2, q3, [x21, #32]
+        stp     q4, q5, [x21, #64]
+        stp     q6, q7, [x21, #96]
 1:
         /* All done, unwind our stack frame.  */
         ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
@@ -203,7 +201,7 @@ CNAME(ffi_call_SYSV):
         .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
 #endif
 
-#define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE)
+#define ffi_closure_SYSV_FS (8 * 2 + CALL_CONTEXT_SIZE)
 
 /* ffi_closure_SYSV
 
@@ -243,10 +241,9 @@ CNAME(ffi_call_SYSV):
    Voila!  */
 
         .text
-        .globl CNAME(ffi_closure_SYSV)
-#ifdef __APPLE__
         .align 2
-#endif
+
+        .globl CNAME(ffi_closure_SYSV)
         .cfi_startproc
 CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
@@ -268,24 +265,23 @@ CNAME(ffi_closure_SYSV):
         /* Preserve our struct trampoline_data *  */
         mov     x22, x17
 
-        /* Save the rest of the argument passing registers.  */
-        stp     x0, x1, [x21, #0]
-        stp     x2, x3, [x21, #16]
-        stp     x4, x5, [x21, #32]
-        stp     x6, x7, [x21, #48]
-        /* Don't forget we may have been given a result scratch pad address.
- */
-        str     x8,     [x21, #64]
+        /* Save the rest of the argument passing registers, including
+   the structure return pointer.  */
+        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
+        str     x8,     [x21, #16*N_V_ARG_REG + 64]
 
         /* Figure out if we should touch the vector registers.  */
         ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Save the argument passing vector registers.  */
-        stp     q0, q1, [x21, #8*32 + 0]
-        stp     q2, q3, [x21, #8*32 + 32]
-        stp     q4, q5, [x21, #8*32 + 64]
-        stp     q6, q7, [x21, #8*32 + 96]
+        stp     q0, q1, [x21, #0]
+        stp     q2, q3, [x21, #32]
+        stp     q4, q5, [x21, #64]
+        stp     q6, q7, [x21, #96]
 1:
         /* Load &ffi_closure..  */
         ldr     x0, [x22, #0]
@@ -298,19 +294,19 @@ CNAME(ffi_closure_SYSV):
 
         /* Figure out if we should touch the vector registers.  */
         ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Load the result passing vector registers.  */
-        ldp     q0, q1, [x21, #8*32 + 0]
-        ldp     q2, q3, [x21, #8*32 + 32]
-        ldp     q4, q5, [x21, #8*32 + 64]
-        ldp     q6, q7, [x21, #8*32 + 96]
+        ldp     q0, q1, [x21, #0]
+        ldp     q2, q3, [x21, #32]
+        ldp     q4, q5, [x21, #64]
+        ldp     q6, q7, [x21, #96]
 1:
         /* Load the result passing core registers.  */
-        ldp     x0, x1, [x21,  #0]
-        ldp     x2, x3, [x21, #16]
-        ldp     x4, x5, [x21, #32]
-        ldp     x6, x7, [x21, #48]
+        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
         /* Note nothing useful is returned in x8.  */
 
         /* We are done, unwind our frame.  */
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 07/16] aarch64: Treat void return as not passed in registers

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

This lets us do less post-processing when there's no return value.
---
 src/aarch64/ffi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 58d088b..6c338e1 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -383,6 +383,7 @@ is_register_candidate (ffi_type *ty)
   switch (ty->type)
     {
     case FFI_TYPE_VOID:
+      return 0;
     case FFI_TYPE_FLOAT:
     case FFI_TYPE_DOUBLE:
     case FFI_TYPE_LONGDOUBLE:
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 06/16] aarch64: Use correct return registers

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

There are fewer return registers than argument registers.
---
 src/aarch64/sysv.S | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index 70870db..fa7ff5b 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -160,22 +160,15 @@ CNAME(ffi_call_SYSV):
 
         blr     x24
 
-        /* Save the core argument passing registers.  */
-        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-
-        /* Note nothing useful ever comes back in x8!  */
+        /* Save the core return registers.  */
+        stp     x0, x1, [x21, #16*N_V_ARG_REG]
 
         /* Figure out if we should touch the vector registers.  */
         tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
 
-        /* Save the vector argument passing registers.  */
+        /* Save the vector return registers.  */
         stp     q0, q1, [x21, #0]
         stp     q2, q3, [x21, #32]
-        stp     q4, q5, [x21, #64]
-        stp     q6, q7, [x21, #96]
 1:
         /* All done, unwind our stack frame.  */
         ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
@@ -299,15 +292,9 @@ CNAME(ffi_closure_SYSV):
         /* Load the result passing vector registers.  */
         ldp     q0, q1, [x21, #0]
         ldp     q2, q3, [x21, #32]
-        ldp     q4, q5, [x21, #64]
-        ldp     q6, q7, [x21, #96]
 1:
         /* Load the result passing core registers.  */
         ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-        /* Note nothing useful is returned in x8.  */
 
         /* We are done, unwind our frame.  */
         ldp     x21, x22, [x29,  #-16]
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 09/16] aarch64: Merge prep_args with ffi_call

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

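Allocate the stack frame for ffi_call_SYSV within ffi_call itself,
using alloca, and marshal the arguments there, so that ffi_call_SYSV
no longer has to call back into C (aarch64_prep_args) to do so.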
---
 src/aarch64/ffi.c  | 193 ++++++++++++++++++++++++-----------------------------
 src/aarch64/sysv.S | 192 ++++++++++++++++------------------------------------
 2 files changed, 144 insertions(+), 241 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index d19384b..a067303 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -72,14 +72,6 @@ ffi_clear_cache (void *start, void *end)
 }
 
 extern void
-ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
-    extended_cif *),
-               struct call_context *context,
-               extended_cif *,
-               size_t,
-               void (*fn)(void));
-
-extern void
 ffi_closure_SYSV (ffi_closure *);
 
 /* Test for an FFI floating point representation.  */
@@ -311,12 +303,11 @@ struct arg_state
 
 /* Initialize a procedure call argument marshalling state.  */
 static void
-arg_init (struct arg_state *state, size_t call_frame_size)
+arg_init (struct arg_state *state)
 {
   state->ngrn = 0;
   state->nsrn = 0;
   state->nsaa = 0;
-
 #if defined (__APPLE__)
   state->allocating_variadic = 0;
 #endif
@@ -529,27 +520,88 @@ allocate_int_to_reg_or_stack (struct call_context *context,
   return allocate_to_stack (state, stack, size, size);
 }
 
-/* Marshall the arguments from FFI representation to procedure call
-   context and stack.  */
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  /* Round the stack up to a multiple of the stack alignment requirement. */
+  cif->bytes = ALIGN(cif->bytes, 16);
 
-static unsigned
-aarch64_prep_args (struct call_context *context, unsigned char *stack,
-   extended_cif *ecif)
+  /* Initialize our flags. We are interested if this CIF will touch a
+     vector register, if so we will enable context save and load to
+     those registers, otherwise not. This is intended to be friendly
+     to lazy float context switching in the kernel.  */
+  cif->aarch64_flags = 0;
+
+  if (is_v_register_candidate (cif->rtype))
+    {
+      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
+    }
+  else
+    {
+      int i;
+      for (i = 0; i < cif->nargs; i++)
+        if (is_v_register_candidate (cif->arg_types[i]))
+          {
+            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
+            break;
+          }
+    }
+
+#if defined (__APPLE__)
+  cif->aarch64_nfixedargs = 0;
+#endif
+
+  return FFI_OK;
+}
+
+#if defined (__APPLE__)
+
+/* Perform Apple-specific cif processing for variadic calls */
+ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
+    unsigned int nfixedargs,
+    unsigned int ntotalargs)
 {
-  ffi_cif *cif = ecif->cif;
-  void **avalue = ecif->avalue;
-  int i, nargs = cif->nargs;
+  ffi_status status;
+
+  status = ffi_prep_cif_machdep (cif);
+
+  cif->aarch64_nfixedargs = nfixedargs;
+
+  return status;
+}
+
+#endif
+
+extern void ffi_call_SYSV (void *stack, void *frame,
+   void (*fn)(void), int flags) FFI_HIDDEN;
+
+/* Call a function with the provided arguments and capture the return
+   value.  */
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  struct call_context *context;
+  void *stack, *frame;
   struct arg_state state;
+  size_t stack_bytes;
+  int i, nargs = cif->nargs;
+  int h, t;
+  ffi_type *rtype;
 
-  arg_init (&state, cif->bytes);
+  /* Allocate consecutive stack for everything we'll need.  */
+  stack_bytes = cif->bytes;
+  stack = alloca (stack_bytes + 32 + sizeof(struct call_context));
+  frame = stack + stack_bytes;
+  context = frame + 32;
 
+  arg_init (&state);
   for (i = 0; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       size_t s = ty->size;
-      int h, t = ty->type;
       void *a = avalue[i];
 
+      t = ty->type;
       switch (t)
  {
  case FFI_TYPE_VOID:
@@ -665,83 +717,12 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
 #endif
     }
 
-  return cif->aarch64_flags;
-}
-
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
-{
-  /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes = ALIGN(cif->bytes, 16);
-
-  /* Initialize our flags. We are interested if this CIF will touch a
-     vector register, if so we will enable context save and load to
-     those registers, otherwise not. This is intended to be friendly
-     to lazy float context switching in the kernel.  */
-  cif->aarch64_flags = 0;
-
-  if (is_v_register_candidate (cif->rtype))
-    {
-      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < cif->nargs; i++)
-        if (is_v_register_candidate (cif->arg_types[i]))
-          {
-            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-            break;
-          }
-    }
-
-#if defined (__APPLE__)
-  cif->aarch64_nfixedargs = 0;
-#endif
-
-  return FFI_OK;
-}
-
-#if defined (__APPLE__)
-
-/* Perform Apple-specific cif processing for variadic calls */
-ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
-    unsigned int nfixedargs,
-    unsigned int ntotalargs)
-{
-  ffi_status status;
-
-  status = ffi_prep_cif_machdep (cif);
-
-  cif->aarch64_nfixedargs = nfixedargs;
-
-  return status;
-}
-
-#endif
-
-/* Call a function with the provided arguments and capture the return
-   value.  */
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
-{
-  extended_cif ecif;
-  struct call_context context;
-  size_t stack_bytes;
-  int h, t;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  ecif.rvalue = rvalue;
-
-  stack_bytes = cif->bytes;
-
-  memset (&context, 0, sizeof (context));
-  if (is_register_candidate (cif->rtype))
+  rtype = cif->rtype;
+  if (is_register_candidate (rtype))
     {
-      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
+      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
 
-      t = cif->rtype->type;
+      t = rtype->type;
       switch (t)
  {
  case FFI_TYPE_INT:
@@ -754,33 +735,35 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
  case FFI_TYPE_POINTER:
  case FFI_TYPE_UINT64:
  case FFI_TYPE_SINT64:
-  *(ffi_arg *)rvalue = extend_integer_type (&context.x[0], t);
+  *(ffi_arg *)rvalue = extend_integer_type (&context->x[0], t);
   break;
 
  case FFI_TYPE_FLOAT:
  case FFI_TYPE_DOUBLE:
  case FFI_TYPE_LONGDOUBLE:
-  compress_hfa_type (rvalue, &context.v[0], 0x100 + t);
+  compress_hfa_type (rvalue, &context->v[0], 0x100 + t);
   break;
 
  case FFI_TYPE_STRUCT:
   h = is_hfa (cif->rtype);
   if (h)
-    compress_hfa_type (rvalue, &context.v[0], h);
-  else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-    memcpy (rvalue, &context.x[0], cif->rtype->size);
+    compress_hfa_type (rvalue, &context->v[0], h);
   else
-    abort();
+    {
+      FFI_ASSERT (rtype->size <= 16);
+      memcpy (rvalue, &context->x[0], rtype->size);
+    }
   break;
 
  default:
-  abort();
+  FFI_ASSERT (0);
+  break;
  }
     }
   else
     {
-      context.x8 = (uintptr_t)rvalue;
-      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
+      context->x8 = (uintptr_t)rvalue;
+      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
     }
 }
 
@@ -851,7 +834,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
   struct arg_state state;
   ffi_type *rtype;
 
-  arg_init (&state, ALIGN(cif->bytes, 16));
+  arg_init (&state);
 
   for (i = 0; i < nargs; i++)
     {
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index fa7ff5b..a5f636a 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -22,6 +22,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include <ffi_cfi.h>
 #include "internal.h"
 
 #ifdef HAVE_MACHINE_ASM_H
@@ -38,158 +39,77 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 #endif
 
-#define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
-#define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
-#define cfi_restore(reg) .cfi_restore reg
-#define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg
+ .text
+ .align 2
 
-        .text
-        .align 2
-
-        .globl CNAME(ffi_call_SYSV)
+ .globl CNAME(ffi_call_SYSV)
 #ifdef __ELF__
-        .type CNAME(ffi_call_SYSV), #function
+ .type CNAME(ffi_call_SYSV), #function
+ .hidden CNAME(ffi_call_SYSV)
 #endif
 
-/* ffi_call_SYSV()
-
-   Create a stack frame, setup an argument context, call the callee
-   and extract the result.
-
-   The maximum required argument stack size is provided,
-   ffi_call_SYSV() allocates that stack space then calls the
-   prepare_fn to populate register context and stack.  The
-   argument passing registers are loaded from the register
-   context and the callee called, on return the register passing
-   register are saved back to the context.  Our caller will
-   extract the return value from the final state of the saved
-   register context.
-
-   Prototype:
-
-   extern unsigned
-   ffi_call_SYSV (void (*)(struct call_context *context, unsigned char *,
-   extended_cif *),
-                  struct call_context *context,
-                  extended_cif *,
-                  size_t required_stack_size,
-                  void (*fn)(void));
+/* ffi_call_SYSV
+   extern void ffi_call_SYSV (void *stack, void *frame,
+      void (*fn)(void), int flags);
 
    Therefore on entry we have:
 
-   x0 prepare_fn
-   x1 &context
-   x2 &ecif
-   x3 bytes
-   x4 fn
-
-   This function uses the following stack frame layout:
+   x0 stack
+   x1 frame
+   x2 fn
+   x3 flags
+*/
 
-   ==
-                saved x30(lr)
-   x29(fp)->    saved x29(fp)
-                saved x24
-                saved x23
-                saved x22
-   sp'    ->    saved x21
-                ...
-   sp     ->    (constructed callee stack arguments)
-   ==
-
-   Voila! */
-
-#define ffi_call_SYSV_FS (8 * 4)
-
-        .cfi_startproc
+ cfi_startproc
 CNAME(ffi_call_SYSV):
-        stp     x29, x30, [sp, #-16]!
- cfi_adjust_cfa_offset (16)
-        cfi_rel_offset (x29, 0)
-        cfi_rel_offset (x30, 8)
-
-        mov     x29, sp
- cfi_def_cfa_register (x29)
-        sub     sp, sp, #ffi_call_SYSV_FS
-
-        stp     x21, x22, [sp, #0]
-        cfi_rel_offset (x21, 0 - ffi_call_SYSV_FS)
-        cfi_rel_offset (x22, 8 - ffi_call_SYSV_FS)
-
-        stp     x23, x24, [sp, #16]
-        cfi_rel_offset (x23, 16 - ffi_call_SYSV_FS)
-        cfi_rel_offset (x24, 24 - ffi_call_SYSV_FS)
-
-        mov     x21, x1
-        mov     x22, x2
-        mov     x24, x4
-
-        /* Allocate the stack space for the actual arguments, many
-           arguments will be passed in registers, but we assume
-           worst case and allocate sufficient stack for ALL of
-           the arguments.  */
-        sub     sp, sp, x3
-
-        /* unsigned (*prepare_fn) (struct call_context *context,
-   unsigned char *stack, extended_cif *ecif);
- */
-        mov     x23, x0
-        mov     x0, x1
-        mov     x1, sp
-        /* x2 already in place */
-        blr     x23
-
-        /* Preserve the flags returned.  */
-        mov     x23, x0
-
-        /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
-
-        /* Load the vector argument passing registers.  */
-        ldp     q0, q1, [x21, #0]
-        ldp     q2, q3, [x21, #32]
-        ldp     q4, q5, [x21, #64]
-        ldp     q6, q7, [x21, #96]
+ /* Use a stack frame allocated by our caller.  */
+ cfi_def_cfa(x1, 32);
+ stp x29, x30, [x1]
+ mov x29, x1
+ mov sp, x0
+ cfi_def_cfa_register(x29)
+ cfi_rel_offset (x29, 0)
+ cfi_rel_offset (x30, 8)
+
+ str w3, [x29, #16] /* save flags */
+ mov x9, x2 /* save fn */
+
+ /* Load the vector argument passing registers, if necessary.  */
+ tbz w3, #AARCH64_FLAG_ARG_V_BIT, 1f
+ ldp     q0, q1, [x29, #32 + 0]
+ ldp     q2, q3, [x29, #32 + 32]
+ ldp     q4, q5, [x29, #32 + 64]
+ ldp     q6, q7, [x29, #32 + 96]
 1:
-        /* Load the core argument passing registers, including
+ /* Load the core argument passing registers, including
    the structure return pointer.  */
-        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-        ldr     x8,     [x21, #16*N_V_ARG_REG + 64]
-
-        blr     x24
+ ldp     x0, x1, [x29, #32 + 16*N_V_ARG_REG + 0]
+ ldp     x2, x3, [x29, #32 + 16*N_V_ARG_REG + 16]
+ ldp     x4, x5, [x29, #32 + 16*N_V_ARG_REG + 32]
+ ldp     x6, x7, [x29, #32 + 16*N_V_ARG_REG + 48]
+ ldr     x8,     [x29, #32 + 16*N_V_ARG_REG + 64]
 
-        /* Save the core return registers.  */
-        stp     x0, x1, [x21, #16*N_V_ARG_REG]
+ blr     x9 /* call fn */
 
-        /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
+ ldr w3, [x29, #16] /* reload flags */
 
-        /* Save the vector return registers.  */
-        stp     q0, q1, [x21, #0]
-        stp     q2, q3, [x21, #32]
-1:
-        /* All done, unwind our stack frame.  */
-        ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
-        cfi_restore (x21)
-        cfi_restore (x22)
-
-        ldp     x23, x24, [x29,  # - ffi_call_SYSV_FS + 16]
-        cfi_restore (x23)
-        cfi_restore (x24)
-
-        mov     sp, x29
+ /* Partially deconstruct the stack frame.  */
+ mov     sp, x29
  cfi_def_cfa_register (sp)
+ ldp     x29, x30, [x29]
 
-        ldp     x29, x30, [sp], #16
- cfi_adjust_cfa_offset (-16)
-        cfi_restore (x29)
-        cfi_restore (x30)
+ /* Save the core return registers.  */
+ stp     x0, x1, [sp, #32 + 16*N_V_ARG_REG]
 
-        ret
+ /* Save the vector return registers, if necessary.  */
+ tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
+ stp     q0, q1, [sp, #32 + 0]
+ stp     q2, q3, [sp, #32 + 32]
+1:
+ /* All done.  */
+ ret
 
-        .cfi_endproc
+ cfi_endproc
 #ifdef __ELF__
         .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
 #endif
@@ -237,7 +157,7 @@ CNAME(ffi_call_SYSV):
         .align 2
 
         .globl CNAME(ffi_closure_SYSV)
-        .cfi_startproc
+        cfi_startproc
 CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
  cfi_adjust_cfa_offset (16)
@@ -310,7 +230,7 @@ CNAME(ffi_closure_SYSV):
         cfi_restore (x30)
 
         ret
-        .cfi_endproc
+ cfi_endproc
 #ifdef __ELF__
         .size CNAME(ffi_closure_SYSV), .-CNAME(ffi_closure_SYSV)
 #endif
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 08/16] aarch64: Tidy up abi manipulation

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

Avoid false abstraction, like get_x_addr.  Avoid recomputing data
about the type being manipulated.  Use NEON insns for HFA manipulation.

Note that some of the inline assembly will go away in a subsequent patch.
---
 src/aarch64/ffi.c | 932 +++++++++++++++++++++---------------------------------
 1 file changed, 367 insertions(+), 565 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 6c338e1..d19384b 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,152 +71,6 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }
 
-static void *
-get_x_addr (struct call_context *context, unsigned n)
-{
-  return &context->x[n];
-}
-
-static void *
-get_s_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
-  return &context->v[n].d[1].s[1];
-#else
-  return &context->v[n].d[0].s[0];
-#endif
-}
-
-static void *
-get_d_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
-  return &context->v[n].d[1];
-#else
-  return &context->v[n].d[0];
-#endif
-}
-
-static void *
-get_v_addr (struct call_context *context, unsigned n)
-{
-  return &context->v[n];
-}
-
-/* Return the memory location at which a basic type would reside
-   were it to have been stored in register n.  */
-
-static void *
-get_basic_type_addr (unsigned short type, struct call_context *context,
-     unsigned n)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-      return get_s_addr (context, n);
-    case FFI_TYPE_DOUBLE:
-      return get_d_addr (context, n);
-    case FFI_TYPE_LONGDOUBLE:
-      return get_v_addr (context, n);
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      return get_x_addr (context, n);
-    case FFI_TYPE_VOID:
-      return NULL;
-    default:
-      FFI_ASSERT (0);
-      return NULL;
-    }
-}
-
-/* Return the alignment width for each of the basic types.  */
-
-static size_t
-get_basic_type_alignment (unsigned short type)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-#if defined (__APPLE__)
-      return sizeof (UINT32);
-#endif
-    case FFI_TYPE_DOUBLE:
-      return sizeof (UINT64);
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof (long double);
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-#if defined (__APPLE__)
-  return sizeof (UINT8);
-#endif
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-#if defined (__APPLE__)
-  return sizeof (UINT16);
-#endif
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT32:
-#if defined (__APPLE__)
-  return sizeof (UINT32);
-#endif
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      return sizeof (UINT64);
-
-    default:
-      FFI_ASSERT (0);
-      return 0;
-    }
-}
-
-/* Return the size in bytes for each of the basic types.  */
-
-static size_t
-get_basic_type_size (unsigned short type)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-      return sizeof (UINT32);
-    case FFI_TYPE_DOUBLE:
-      return sizeof (UINT64);
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof (long double);
-    case FFI_TYPE_UINT8:
-      return sizeof (UINT8);
-    case FFI_TYPE_SINT8:
-      return sizeof (SINT8);
-    case FFI_TYPE_UINT16:
-      return sizeof (UINT16);
-    case FFI_TYPE_SINT16:
-      return sizeof (SINT16);
-    case FFI_TYPE_UINT32:
-      return sizeof (UINT32);
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT32:
-      return sizeof (SINT32);
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-      return sizeof (UINT64);
-    case FFI_TYPE_SINT64:
-      return sizeof (SINT64);
-
-    default:
-      FFI_ASSERT (0);
-      return 0;
-    }
-}
-
 extern void
 ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
     extended_cif *),
@@ -468,223 +322,211 @@ arg_init (struct arg_state *state, size_t call_frame_size)
 #endif
 }
 
-/* Return the number of available consecutive core argument
-   registers.  */
-
-static unsigned
-available_x (struct arg_state *state)
-{
-  return N_X_ARG_REG - state->ngrn;
-}
-
-/* Return the number of available consecutive vector argument
-   registers.  */
-
-static unsigned
-available_v (struct arg_state *state)
-{
-  return N_V_ARG_REG - state->nsrn;
-}
-
-static void *
-allocate_to_x (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->ngrn < N_X_ARG_REG);
-  return get_x_addr (context, (state->ngrn)++);
-}
-
-static void *
-allocate_to_s (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
-  return get_s_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_d (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
-  return get_d_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_v (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
-  return get_v_addr (context, (state->nsrn)++);
-}
-
 /* Allocate an aligned slot on the stack and return a pointer to it.  */
 static void *
-allocate_to_stack (struct arg_state *state, void *stack, size_t alignment,
-   size_t size)
+allocate_to_stack (struct arg_state *state, void *stack,
+   size_t alignment, size_t size)
 {
-  void *allocation;
+  size_t nsaa = state->nsaa;
 
   /* Round up the NSAA to the larger of 8 or the natural
      alignment of the argument's type.  */
-  state->nsaa = ALIGN (state->nsaa, alignment);
-  state->nsaa = ALIGN (state->nsaa, alignment);
 #if defined (__APPLE__)
-  if (state->allocating_variadic)
-    state->nsaa = ALIGN (state->nsaa, 8);
+  if (state->allocating_variadic && alignment < 8)
+    alignment = 8;
 #else
-  state->nsaa = ALIGN (state->nsaa, 8);
+  if (alignment < 8)
+    alignment = 8;
 #endif
+    
+  nsaa = ALIGN (nsaa, alignment);
+  state->nsaa = nsaa + size;
 
-  allocation = stack + state->nsaa;
-
-  state->nsaa += size;
-  return allocation;
+  return (char *)stack + nsaa;
 }
 
-static void
-copy_basic_type (void *dest, void *source, unsigned short type)
+static ffi_arg
+extend_integer_type (void *source, int type)
 {
-  /* This is necessary to ensure that basic types are copied
-     sign extended to 64-bits as libffi expects.  */
   switch (type)
     {
-    case FFI_TYPE_FLOAT:
-      *(float *) dest = *(float *) source;
-      break;
-    case FFI_TYPE_DOUBLE:
-      *(double *) dest = *(double *) source;
-      break;
-    case FFI_TYPE_LONGDOUBLE:
-      *(long double *) dest = *(long double *) source;
-      break;
     case FFI_TYPE_UINT8:
-      *(ffi_arg *) dest = *(UINT8 *) source;
-      break;
+      return *(UINT8 *) source;
     case FFI_TYPE_SINT8:
-      *(ffi_sarg *) dest = *(SINT8 *) source;
-      break;
+      return *(SINT8 *) source;
     case FFI_TYPE_UINT16:
-      *(ffi_arg *) dest = *(UINT16 *) source;
-      break;
+      return *(UINT16 *) source;
     case FFI_TYPE_SINT16:
-      *(ffi_sarg *) dest = *(SINT16 *) source;
-      break;
+      return *(SINT16 *) source;
     case FFI_TYPE_UINT32:
-      *(ffi_arg *) dest = *(UINT32 *) source;
-      break;
+      return *(UINT32 *) source;
     case FFI_TYPE_INT:
     case FFI_TYPE_SINT32:
-      *(ffi_sarg *) dest = *(SINT32 *) source;
-      break;
-    case FFI_TYPE_POINTER:
+      return *(SINT32 *) source;
     case FFI_TYPE_UINT64:
-      *(ffi_arg *) dest = *(UINT64 *) source;
-      break;
     case FFI_TYPE_SINT64:
-      *(ffi_sarg *) dest = *(SINT64 *) source;
-      break;
-    case FFI_TYPE_VOID:
+      return *(UINT64 *) source;
       break;
-
+    case FFI_TYPE_POINTER:
+      return *(uintptr_t *) source;
     default:
-      FFI_ASSERT (0);
+      abort();
     }
 }
 
 static void
-copy_hfa_to_reg_or_stack (void *memory,
-  ffi_type *ty,
-  struct call_context *context,
-  unsigned char *stack,
-  struct arg_state *state)
-{
-  int h = is_hfa (ty);
-  int type = h & 0xff;
-  unsigned elems = h >> 8;
-
-  if (available_v (state) < elems)
-    {
-      /* There are insufficient V registers. Further V register allocations
- are prevented, the NSAA is adjusted (by allocate_to_stack ())
- and the argument is copied to memory at the adjusted NSAA.  */
-      state->nsrn = N_V_ARG_REG;
-      memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
-      memory,
-      ty->size);
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < elems; i++)
- {
-  void *reg = allocate_to_v (context, state);
-  copy_basic_type (reg, memory, type);
-  memory += get_basic_type_size (type);
- }
-    }
+extend_hfa_type (void *dest, void *src, int h)
+{
+  int n = (h >> 8);
+  int t = h & 0xff;
+  int f = (t - FFI_TYPE_FLOAT) * 4 + 4 - n;
+  void *x0;
+
+  asm volatile (
+ "adr %0, 0f\n"
+" add %0, %0, %1\n"
+" br %0\n"
+"0: ldp s16, s17, [%3]\n" /* S4 */
+" ldp s18, s19, [%3, #8]\n"
+" b 4f\n"
+" ldp s16, s17, [%3]\n" /* S3 */
+" ldr s18, [%3, #8]\n"
+" b 3f\n"
+" ldp s16, s17, [%3]\n" /* S2 */
+" b 2f\n"
+" nop\n"
+" ldr s16, [%3]\n" /* S1 */
+" b 1f\n"
+" nop\n"
+" ldp d16, d17, [%3]\n" /* D4 */
+" ldp d18, d19, [%3, #16]\n"
+" b 4f\n"
+" ldp d16, d17, [%3]\n" /* D3 */
+" ldr d18, [%3, #16]\n"
+" b 3f\n"
+" ldp d16, d17, [%3]\n" /* D2 */
+" b 2f\n"
+" nop\n"
+" ldr d16, [%3]\n" /* D1 */
+" b 1f\n"
+" nop\n"
+" ldp q16, q17, [%3]\n" /* Q4 */
+" ldp q18, q19, [%3, #16]\n"
+" b 4f\n"
+" ldp q16, q17, [%3]\n" /* Q3 */
+" ldr q18, [%3, #16]\n"
+" b 3f\n"
+" ldp q16, q17, [%3]\n" /* Q2 */
+" b 2f\n"
+" nop\n"
+" ldr q16, [%3]\n" /* Q1 */
+" b 1f\n"
+"4: str q19, [%2, #48]\n"
+"3: str q18, [%2, #32]\n"
+"2: str q17, [%2, #16]\n"
+"1: str q16, [%2]"
+    : "=&r"(x0)
+    : "r"(f * 12), "r"(dest), "r"(src)
+    : "memory", "v16", "v17", "v18", "v19");
 }
 
-/* Either allocate an appropriate register for the argument type, or if
-   none are available, allocate a stack slot and return a pointer
-   to the allocated space.  */
-
 static void *
-allocate_to_register_or_stack (struct call_context *context,
-       unsigned char *stack,
-       struct arg_state *state,
-       unsigned short type)
+compress_hfa_type (void *dest, void *reg, int h)
 {
-  size_t alignment = get_basic_type_alignment (type);
-  size_t size = alignment;
-  switch (type)
+  int n = h >> 8;
+  switch (h & 0xff)
     {
     case FFI_TYPE_FLOAT:
-      /* This is the only case for which the allocated stack size
- should not match the alignment of the type.  */
-      size = sizeof (UINT32);
-      /* Fall through.  */
+      switch (n)
+ {
+ default:
+  if (dest == reg)
+    {
+#ifdef __AARCH64EB__
+      dest += 12;
+#endif
+    }
+  else
+    *(float *)dest = *(float *)reg;
+  break;
+ case 2:
+  asm("ldp q16, q17, [%1]\n\t"
+      "st2 { v16.s, v17.s }[0], [%0]"
+      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+  break;
+ case 3:
+  asm("ldp q16, q17, [%1]\n\t"
+      "ldr q18, [%1, #32]\n\t"
+      "st3 { v16.s, v17.s, v18.s }[0], [%0]"
+      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+  break;
+ case 4:
+  asm("ldp q16, q17, [%1]\n\t"
+      "ldp q18, q19, [%1, #32]\n\t"
+      "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
+      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+  break;
+ }
+      break;
+
     case FFI_TYPE_DOUBLE:
-      if (state->nsrn < N_V_ARG_REG)
- return allocate_to_d (context, state);
-      state->nsrn = N_V_ARG_REG;
+      switch (n)
+ {
+ default:
+  if (dest == reg)
+    {
+#ifdef __AARCH64EB__
+      dest += 8;
+#endif
+    }
+  else
+    *(double *)dest = *(double *)reg;
+  break;
+ case 2:
+  asm("ldp q16, q17, [%1]\n\t"
+      "st2 { v16.d, v17.d }[0], [%0]"
+      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+  break;
+ case 3:
+  asm("ldp q16, q17, [%1]\n\t"
+      "ldr q18, [%1, #32]\n\t"
+      "st3 { v16.d, v17.d, v18.d }[0], [%0]"
+      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+  break;
+ case 4:
+  asm("ldp q16, q17, [%1]\n\t"
+      "ldp q18, q19, [%1, #32]\n\t"
+      "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
+      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+  break;
+ }
       break;
+
     case FFI_TYPE_LONGDOUBLE:
-      if (state->nsrn < N_V_ARG_REG)
- return allocate_to_v (context, state);
-      state->nsrn = N_V_ARG_REG;
-      break;
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      if (state->ngrn < N_X_ARG_REG)
- return allocate_to_x (context, state);
-      state->ngrn = N_X_ARG_REG;
+      if (dest != reg)
+ return memcpy (dest, reg, 16 * n);
       break;
+
     default:
       FFI_ASSERT (0);
     }
-
-    return allocate_to_stack (state, stack, alignment, size);
+  return dest;
 }
 
-/* Copy a value to an appropriate register, or if none are
-   available, to the stack.  */
+/* Either allocate an appropriate register for the argument type, or if
+   none are available, allocate a stack slot and return a pointer
+   to the allocated space.  */
 
-static void
-copy_to_register_or_stack (struct call_context *context,
-   unsigned char *stack,
-   struct arg_state *state,
-   void *value,
-   unsigned short type)
+static void *
+allocate_int_to_reg_or_stack (struct call_context *context,
+      struct arg_state *state,
+      void *stack, size_t size)
 {
-  copy_basic_type (
-  allocate_to_register_or_stack (context, stack, state, type),
-  value,
-  type);
+  if (state->ngrn < N_X_ARG_REG)
+    return &context->x[state->ngrn++];
+
+  state->ngrn = N_X_ARG_REG;
+  return allocate_to_stack (state, stack, size, size);
 }
 
 /* Marshall the arguments from FFI representation to procedure call
@@ -694,15 +536,21 @@ static unsigned
 aarch64_prep_args (struct call_context *context, unsigned char *stack,
    extended_cif *ecif)
 {
-  int i;
+  ffi_cif *cif = ecif->cif;
+  void **avalue = ecif->avalue;
+  int i, nargs = cif->nargs;
   struct arg_state state;
 
-  arg_init (&state, ALIGN(ecif->cif->bytes, 16));
+  arg_init (&state, cif->bytes);
 
-  for (i = 0; i < ecif->cif->nargs; i++)
+  for (i = 0; i < nargs; i++)
     {
-      ffi_type *ty = ecif->cif->arg_types[i];
-      switch (ty->type)
+      ffi_type *ty = cif->arg_types[i];
+      size_t s = ty->size;
+      int h, t = ty->type;
+      void *a = avalue[i];
+
+      switch (t)
  {
  case FFI_TYPE_VOID:
   FFI_ASSERT (0);
@@ -710,82 +558,114 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
 
  /* If the argument is a basic type the argument is allocated to an
    appropriate register, or if none are available, to the stack.  */
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
+ case FFI_TYPE_INT:
  case FFI_TYPE_UINT8:
  case FFI_TYPE_SINT8:
  case FFI_TYPE_UINT16:
  case FFI_TYPE_SINT16:
  case FFI_TYPE_UINT32:
- case FFI_TYPE_INT:
  case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
  case FFI_TYPE_UINT64:
  case FFI_TYPE_SINT64:
-  copy_to_register_or_stack (context, stack, &state,
-     ecif->avalue[i], ty->type);
+ case FFI_TYPE_POINTER:
+ do_pointer:
+  {
+    ffi_arg ext = extend_integer_type (a, t);
+    if (state.ngrn < N_X_ARG_REG)
+      context->x[state.ngrn++] = ext;
+    else
+      {
+ void *d = allocate_to_stack (&state, stack, ty->alignment, s);
+ state.ngrn = N_X_ARG_REG;
+ /* Note that the default abi extends each argument
+   to a full 64-bit slot, while the iOS abi allocates
+   only enough space. */
+#ifdef __APPLE__
+ memcpy(d, a, s);
+#else
+ *(ffi_arg *)d = ext;
+#endif
+      }
+  }
   break;
 
- case FFI_TYPE_STRUCT:
-  if (is_hfa (ty))
-    {
-      copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
- stack, &state);
-    }
-  else if (ty->size > 16)
-    {
-      /* If the argument is a composite type that is larger than 16
- bytes, then the argument has been copied to memory, and
- the argument is replaced by a pointer to the copy.  */
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+  /* Scalar float is a degenerate case of HFA.  */
+  h = t + 0x100;
+  goto do_hfa;
 
-      copy_to_register_or_stack (context, stack, &state,
- &(ecif->avalue[i]), FFI_TYPE_POINTER);
-    }
-  else if (available_x (&state) >= (ty->size + 7) / 8)
-    {
-      /* If the argument is a composite type and the size in
- double-words is not more than the number of available
- X registers, then the argument is copied into consecutive
- X registers.  */
-      int j;
-      for (j = 0; j < (ty->size + 7) / 8; j++)
- {
-  memcpy (allocate_to_x (context, &state),
-  &(((UINT64 *) ecif->avalue[i])[j]),
-  sizeof (UINT64));
+ case FFI_TYPE_STRUCT:
+  {
+    void *dest;
+    int elems;
+
+    h = is_hfa (ty);
+    if (h)
+      {
+    do_hfa:
+ elems = h >> 8;
+        if (state.nsrn + elems <= N_V_ARG_REG)
+  {
+    dest = &context->v[state.nsrn];
+    state.nsrn += elems;
+    extend_hfa_type (dest, a, h);
+    break;
+  }
+ state.nsrn = N_V_ARG_REG;
+ dest = allocate_to_stack (&state, stack, ty->alignment, s);
+      }
+    else if (s > 16)
+      {
+ /* If the argument is a composite type that is larger than 16
+   bytes, then the argument has been copied to memory, and
+   the argument is replaced by a pointer to the copy.  */
+ a = &avalue[i];
+ t = FFI_TYPE_POINTER;
+ goto do_pointer;
+      }
+    else
+      {
+ size_t n = (s + 7) / 8;
+ if (state.ngrn + n <= N_X_ARG_REG)
+  {
+    /* If the argument is a composite type and the size in
+       double-words is not more than the number of available
+       X registers, then the argument is copied into
+       consecutive X registers.  */
+    dest = &context->x[state.ngrn];
+    state.ngrn += n;
+  }
+ else
+  {
+    /* Otherwise, there are insufficient X registers. Further
+       X register allocations are prevented, the NSAA is
+       adjusted and the argument is copied to memory at the
+       adjusted NSAA.  */
+    state.ngrn = N_X_ARG_REG;
+    dest = allocate_to_stack (&state, stack, ty->alignment, s);
+  }
  }
-    }
-  else
-    {
-      /* Otherwise, there are insufficient X registers. Further X
- register allocations are prevented, the NSAA is adjusted
- (by allocate_to_stack ()) and the argument is copied to
- memory at the adjusted NSAA.  */
-      state.ngrn = N_X_ARG_REG;
-
-      memcpy (allocate_to_stack (&state, stack, ty->alignment,
- ty->size), ecif->avalue + i, ty->size);
+      memcpy (dest, a, s);
     }
   break;
 
  default:
-  FFI_ASSERT (0);
-  break;
+  abort();
  }
 
 #if defined (__APPLE__)
-      if (i + 1 == ecif->cif->aarch64_nfixedargs)
+      if (i + 1 == cif->aarch64_nfixedargs)
  {
   state.ngrn = N_X_ARG_REG;
   state.nsrn = N_V_ARG_REG;
-
   state.allocating_variadic = 1;
  }
 #endif
     }
 
-  return ecif->cif->aarch64_flags;
+  return cif->aarch64_flags;
 }
 
 ffi_status
@@ -846,94 +726,61 @@ void
 ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
   extended_cif ecif;
-  int h;
+  struct call_context context;
+  size_t stack_bytes;
+  int h, t;
 
   ecif.cif = cif;
   ecif.avalue = avalue;
   ecif.rvalue = rvalue;
 
-  switch (cif->abi)
+  stack_bytes = cif->bytes;
+
+  memset (&context, 0, sizeof (context));
+  if (is_register_candidate (cif->rtype))
     {
-    case FFI_SYSV:
-      {
-        struct call_context context;
- size_t stack_bytes;
+      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
 
- /* Figure out the total amount of stack space we need, the
-   above call frame space needs to be 16 bytes aligned to
-   ensure correct alignment of the first object inserted in
-   that space hence the ALIGN applied to cif->bytes.*/
- stack_bytes = ALIGN(cif->bytes, 16);
+      t = cif->rtype->type;
+      switch (t)
+ {
+ case FFI_TYPE_INT:
+ case FFI_TYPE_UINT8:
+ case FFI_TYPE_SINT8:
+ case FFI_TYPE_UINT16:
+ case FFI_TYPE_SINT16:
+ case FFI_TYPE_UINT32:
+ case FFI_TYPE_SINT32:
+ case FFI_TYPE_POINTER:
+ case FFI_TYPE_UINT64:
+ case FFI_TYPE_SINT64:
+  *(ffi_arg *)rvalue = extend_integer_type (&context.x[0], t);
+  break;
 
- memset (&context, 0, sizeof (context));
-        if (is_register_candidate (cif->rtype))
-          {
-            ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
-            switch (cif->rtype->type)
-              {
-              case FFI_TYPE_VOID:
-              case FFI_TYPE_FLOAT:
-              case FFI_TYPE_DOUBLE:
-              case FFI_TYPE_LONGDOUBLE:
-              case FFI_TYPE_UINT8:
-              case FFI_TYPE_SINT8:
-              case FFI_TYPE_UINT16:
-              case FFI_TYPE_SINT16:
-              case FFI_TYPE_UINT32:
-              case FFI_TYPE_SINT32:
-              case FFI_TYPE_POINTER:
-              case FFI_TYPE_UINT64:
-              case FFI_TYPE_INT:
-              case FFI_TYPE_SINT64:
- {
-  void *addr = get_basic_type_addr (cif->rtype->type,
-    &context, 0);
-  copy_basic_type (rvalue, addr, cif->rtype->type);
-  break;
- }
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+  compress_hfa_type (rvalue, &context.v[0], 0x100 + t);
+  break;
 
-              case FFI_TYPE_STRUCT:
- h = is_hfa (cif->rtype);
-                if (h)
-  {
-    int j;
-    int type = h & 0xff;
-    int elems = h >> 8;
-    for (j = 0; j < elems; j++)
-      {
- void *reg = get_basic_type_addr (type, &context, j);
- copy_basic_type (rvalue, reg, type);
- rvalue += get_basic_type_size (type);
-      }
-  }
-                else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-                  {
-                    size_t size = ALIGN (cif->rtype->size, sizeof (UINT64));
-                    memcpy (rvalue, get_x_addr (&context, 0), size);
-                  }
-                else
-                  {
-                    FFI_ASSERT (0);
-                  }
-                break;
-
-              default:
-                FFI_ASSERT (0);
-                break;
-              }
-          }
-        else
-          {
-    context.x8 = (uintptr_t)rvalue;
-            ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
-   stack_bytes, fn);
-          }
-        break;
-      }
+ case FFI_TYPE_STRUCT:
+  h = is_hfa (cif->rtype);
+  if (h)
+    compress_hfa_type (rvalue, &context.v[0], h);
+  else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
+    memcpy (rvalue, &context.x[0], cif->rtype->size);
+  else
+    abort();
+  break;
 
-    default:
-      FFI_ASSERT (0);
-      break;
+ default:
+  abort();
+ }
+    }
+  else
+    {
+      context.x8 = (uintptr_t)rvalue;
+      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
     }
 }
 
@@ -1000,203 +847,158 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
   ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
   void *rvalue = NULL;
-  int i, h;
+  int i, h, nargs = cif->nargs;
   struct arg_state state;
+  ffi_type *rtype;
 
   arg_init (&state, ALIGN(cif->bytes, 16));
 
-  for (i = 0; i < cif->nargs; i++)
+  for (i = 0; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
+      int t = ty->type;
+      size_t n, s = ty->size;
 
-      switch (ty->type)
+      switch (t)
  {
  case FFI_TYPE_VOID:
   FFI_ASSERT (0);
   break;
 
+ case FFI_TYPE_INT:
  case FFI_TYPE_UINT8:
  case FFI_TYPE_SINT8:
  case FFI_TYPE_UINT16:
  case FFI_TYPE_SINT16:
  case FFI_TYPE_UINT32:
  case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
  case FFI_TYPE_UINT64:
  case FFI_TYPE_SINT64:
+ case FFI_TYPE_POINTER:
+  avalue[i] = allocate_int_to_reg_or_stack (context, &state, stack, s);
+  break;
+
  case FFI_TYPE_FLOAT:
  case FFI_TYPE_DOUBLE:
  case FFI_TYPE_LONGDOUBLE:
-  avalue[i] = allocate_to_register_or_stack (context, stack,
-     &state, ty->type);
-  break;
+  /* Scalar float is a degenerate case of HFA.  */
+  h = t + 0x100;
+  goto do_hfa;
 
  case FFI_TYPE_STRUCT:
   h = is_hfa (ty);
   if (h)
     {
-      unsigned n = h >> 8;
-      if (available_v (&state) < n)
+    do_hfa:
+      n = h >> 8;
+      if (state.nsrn + n <= N_V_ARG_REG)
  {
-  state.nsrn = N_V_ARG_REG;
-  avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
- ty->size);
+  void *reg = &context->v[state.nsrn];
+  state.nsrn += n;
+
+  /* Eeek! We need a pointer to the structure, however the
+     homogeneous float elements are being passed in individual
+     registers, therefore for float and double the structure
+     is not represented as a contiguous sequence of bytes in
+     our saved register context.  We don't need the original
+     contents of the register storage, so we reformat the
+     structure into the same memory.  */
+  avalue[i] = compress_hfa_type (reg, reg, h);
  }
       else
  {
-  switch (h & 0xff)
-    {
-    case FFI_TYPE_FLOAT:
-      {
- /* Eeek! We need a pointer to the structure,
-   however the homogeneous float elements are
-   being passed in individual S registers,
-   therefore the structure is not represented as
-   a contiguous sequence of bytes in our saved
-   register context. We need to fake up a copy
-   of the structure laid out in memory
-   correctly. The fake can be tossed once the
-   closure function has returned hence alloca()
-   is sufficient. */
- unsigned j;
- UINT32 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < n; j++)
-  memcpy (&p[j],
-  allocate_to_s (context, &state),
-  sizeof (*p));
- break;
-      }
-
-    case FFI_TYPE_DOUBLE:
-      {
- /* Eeek! We need a pointer to the structure,
-   however the homogeneous float elements are
-   being passed in individual S registers,
-   therefore the structure is not represented as
-   a contiguous sequence of bytes in our saved
-   register context. We need to fake up a copy
-   of the structure laid out in memory
-   correctly. The fake can be tossed once the
-   closure function has returned hence alloca()
-   is sufficient. */
- unsigned j;
- UINT64 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < n; j++)
-  memcpy (&p[j],
-  allocate_to_d (context, &state),
-  sizeof (*p));
- break;
-      }
-
-    case FFI_TYPE_LONGDOUBLE:
-  memcpy (&avalue[i],
-  allocate_to_v (context, &state),
-  sizeof (*avalue));
-      break;
-
-    default:
-      FFI_ASSERT (0);
-      break;
-    }
+  state.nsrn = N_V_ARG_REG;
+  avalue[i] = allocate_to_stack (&state, stack,
+ ty->alignment, s);
  }
     }
-  else if (ty->size > 16)
+  else if (s > 16)
     {
       /* Replace Composite type of size greater than 16 with a
  pointer.  */
-      memcpy (&avalue[i],
-      allocate_to_register_or_stack (context, stack,
-     &state, FFI_TYPE_POINTER),
-      sizeof (avalue[i]));
-    }
-  else if (available_x (&state) >= (ty->size + 7) / 8)
-    {
-      avalue[i] = get_x_addr (context, state.ngrn);
-      state.ngrn += (ty->size + 7) / 8;
+      avalue[i] = *(void **)
+ allocate_int_to_reg_or_stack (context, &state, stack,
+      sizeof (void *));
     }
   else
     {
-      state.ngrn = N_X_ARG_REG;
-
-      avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
-     ty->size);
+      n = (s + 7) / 8;
+      if (state.ngrn + n <= N_X_ARG_REG)
+ {
+  avalue[i] = &context->x[state.ngrn];
+  state.ngrn += n;
+ }
+      else
+ {
+  state.ngrn = N_X_ARG_REG;
+  avalue[i] = allocate_to_stack (&state, stack,
+ ty->alignment, s);
+ }
     }
   break;
 
  default:
-  FFI_ASSERT (0);
-  break;
+  abort();
  }
     }
 
-  /* Figure out where the return value will be passed, either in
-     registers or in a memory block allocated by the caller and passed
-     in x8.  */
-
-  if (is_register_candidate (cif->rtype))
+  /* Figure out where the return value will be passed, either in registers
+     or in a memory block allocated by the caller and passed in x8.  */
+  rtype = cif->rtype;
+  if (is_register_candidate (rtype))
     {
+      size_t s = rtype->size;
+      int t;
+
       /* Register candidates are *always* returned in registers. */
 
       /* Allocate a scratchpad for the return value, we will let the
          callee scrible the result into the scratch pad then move the
          contents into the appropriate return value location for the
          call convention.  */
-      rvalue = alloca (cif->rtype->size);
+      rvalue = alloca (s);
       (closure->fun) (cif, rvalue, avalue, closure->user_data);
 
       /* Copy the return value into the call context so that it is returned
          as expected to our caller.  */
-      switch (cif->rtype->type)
+      t = rtype->type;
+      switch (t)
         {
         case FFI_TYPE_VOID:
           break;
 
+        case FFI_TYPE_INT:
         case FFI_TYPE_UINT8:
         case FFI_TYPE_UINT16:
         case FFI_TYPE_UINT32:
-        case FFI_TYPE_POINTER:
         case FFI_TYPE_UINT64:
         case FFI_TYPE_SINT8:
         case FFI_TYPE_SINT16:
-        case FFI_TYPE_INT:
         case FFI_TYPE_SINT32:
         case FFI_TYPE_SINT64:
+        case FFI_TYPE_POINTER:
+  context->x[0] = extend_integer_type (rvalue, t);
+          break;
+
         case FFI_TYPE_FLOAT:
         case FFI_TYPE_DOUBLE:
         case FFI_TYPE_LONGDOUBLE:
-  {
-    void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
-    copy_basic_type (addr, rvalue, cif->rtype->type);
-            break;
-  }
+  extend_hfa_type (&context->v[0], rvalue, 0x100 + t);
+  break;
+
         case FFI_TYPE_STRUCT:
   h = is_hfa (cif->rtype);
           if (h)
-    {
-      int j;
-      int type = h & 0xff;
-      int elems = h >> 8;
-      for (j = 0; j < elems; j++)
- {
-  void *reg = get_basic_type_addr (type, context, j);
-  copy_basic_type (reg, rvalue, type);
-  rvalue += get_basic_type_size (type);
- }
-    }
-          else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-            {
-              size_t size = ALIGN (cif->rtype->size, sizeof (UINT64)) ;
-              memcpy (get_x_addr (context, 0), rvalue, size);
-            }
+    extend_hfa_type (&context->v[0], rvalue, h);
           else
-            {
-              FFI_ASSERT (0);
+    {
+      FFI_ASSERT (s <= 16);
+              memcpy (&context->x[0], rvalue, s);
             }
           break;
+
         default:
-          FFI_ASSERT (0);
-          break;
+          abort();
         }
     }
   else
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 10/16] aarch64: Move return value handling into ffi_call_SYSV

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

This lets us pass return data directly to the caller of ffi_call
in most cases, rather than storing it into temporary storage first.
---
 src/aarch64/ffi.c      | 202 ++++++++++++++++++++++++++++---------------------
 src/aarch64/internal.h |  43 ++++++++++-
 src/aarch64/sysv.S     | 127 ++++++++++++++++++++++++-------
 3 files changed, 258 insertions(+), 114 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index a067303..ffa1363 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -523,30 +523,90 @@ allocate_int_to_reg_or_stack (struct call_context *context,
 ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes = ALIGN(cif->bytes, 16);
-
-  /* Initialize our flags. We are interested if this CIF will touch a
-     vector register, if so we will enable context save and load to
-     those registers, otherwise not. This is intended to be friendly
-     to lazy float context switching in the kernel.  */
-  cif->aarch64_flags = 0;
+  ffi_type *rtype = cif->rtype;
+  size_t bytes = cif->bytes;
+  int flags, aarch64_flags, i, n;
 
-  if (is_v_register_candidate (cif->rtype))
+  switch (rtype->type)
     {
-      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < cif->nargs; i++)
-        if (is_v_register_candidate (cif->arg_types[i]))
-          {
-            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-            break;
-          }
+    case FFI_TYPE_VOID:
+      flags = AARCH64_RET_VOID;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = AARCH64_RET_UINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = AARCH64_RET_UINT16;
+      break;
+    case FFI_TYPE_UINT32:
+      flags = AARCH64_RET_UINT32;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = AARCH64_RET_SINT8;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = AARCH64_RET_SINT16;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      flags = AARCH64_RET_SINT32;
+      break;
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT64:
+      flags = AARCH64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 4 ? AARCH64_RET_UINT32 : AARCH64_RET_INT64);
+      break;
+
+    case FFI_TYPE_FLOAT:
+      flags = AARCH64_RET_S1;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = AARCH64_RET_D1;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = AARCH64_RET_Q1;
+      break;
+
+    case FFI_TYPE_STRUCT:
+      {
+ int h = is_hfa (rtype);
+ size_t s = rtype->size;
+
+ if (h)
+  flags = (h & 0xff) * 4 + 4 - (h >> 8);
+ else if (s > 16)
+  {
+    flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
+    bytes += 8;
+  }
+ else if (s == 16)
+  flags = AARCH64_RET_INT128;
+ else if (s == 8)
+  flags = AARCH64_RET_INT64;
+ else
+  flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
+      }
+      break;
+
+    default:
+      abort();
     }
 
+  aarch64_flags = 0;
+  for (i = 0, n = cif->nargs; i < n; i++)
+    if (is_v_register_candidate (cif->arg_types[i]))
+      {
+ aarch64_flags = AARCH64_FLAG_ARG_V;
+ flags |= AARCH64_FLAG_ARG_V;
+ break;
+      }
+
+  /* Round the stack up to a multiple of the stack alignment requirement. */
+  cif->bytes = ALIGN(bytes, 16);
+  cif->flags = flags;
+  cif->aarch64_flags = aarch64_flags;
 #if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
 #endif
@@ -555,51 +615,65 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 }
 
 #if defined (__APPLE__)
-
 /* Perform Apple-specific cif processing for variadic calls */
 ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
     unsigned int nfixedargs,
     unsigned int ntotalargs)
 {
-  ffi_status status;
-
-  status = ffi_prep_cif_machdep (cif);
-
+  ffi_status status = ffi_prep_cif_machdep (cif);
   cif->aarch64_nfixedargs = nfixedargs;
-
   return status;
 }
+#endif /* __APPLE__ */
 
-#endif
-
-extern void ffi_call_SYSV (void *stack, void *frame,
-   void (*fn)(void), int flags) FFI_HIDDEN;
+extern void ffi_call_SYSV (struct call_context *context, void *frame,
+   void (*fn)(void), void *rvalue, int flags)
+ FFI_HIDDEN;
 
 /* Call a function with the provided arguments and capture the return
    value.  */
 void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 {
   struct call_context *context;
-  void *stack, *frame;
+  void *stack, *frame, *rvalue;
   struct arg_state state;
-  size_t stack_bytes;
-  int i, nargs = cif->nargs;
-  int h, t;
+  size_t stack_bytes, rtype_size, rsize;
+  int i, nargs, flags;
   ffi_type *rtype;
 
-  /* Allocate consectutive stack for everything we'll need.  */
+  flags = cif->flags;
+  rtype = cif->rtype;
+  rtype_size = rtype->size;
   stack_bytes = cif->bytes;
-  stack = alloca (stack_bytes + 32 + sizeof(struct call_context));
+
+  /* If the target function returns a structure via hidden pointer,
+     then we cannot allow a null rvalue.  Otherwise, mash a null
+     rvalue to void return type.  */
+  rsize = 0;
+  if (flags & AARCH64_RET_IN_MEM)
+    {
+      if (orig_rvalue == NULL)
+ rsize = rtype_size;
+    }
+  else if (orig_rvalue == NULL)
+    flags &= AARCH64_FLAG_ARG_V;
+  else if (flags & AARCH64_RET_NEED_COPY)
+    rsize = 16;
+
+  /* Allocate consecutive stack for everything we'll need.  */
+  context = alloca (sizeof(struct call_context) + stack_bytes + 32 + rsize);
+  stack = context + 1;
   frame = stack + stack_bytes;
-  context = frame + 32;
+  rvalue = (rsize ? frame + 32 : orig_rvalue);
 
   arg_init (&state);
-  for (i = 0; i < nargs; i++)
+  for (i = 0, nargs = cif->nargs; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       size_t s = ty->size;
       void *a = avalue[i];
+      int h, t;
 
       t = ty->type;
       switch (t)
@@ -717,54 +791,10 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 #endif
     }
 
-  rtype = cif->rtype;
-  if (is_register_candidate (rtype))
-    {
-      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
+  ffi_call_SYSV (context, frame, fn, rvalue, flags);
 
-      t = rtype->type;
-      switch (t)
- {
- case FFI_TYPE_INT:
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
-  *(ffi_arg *)rvalue = extend_integer_type (&context->x[0], t);
-  break;
-
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
-  compress_hfa_type (rvalue, &context->v[0], 0x100 + t);
-  break;
-
- case FFI_TYPE_STRUCT:
-  h = is_hfa (cif->rtype);
-  if (h)
-    compress_hfa_type (rvalue, &context->v[0], h);
-  else
-    {
-      FFI_ASSERT (rtype->size <= 16);
-      memcpy (rvalue, &context->x[0], rtype->size);
-    }
-  break;
-
- default:
-  FFI_ASSERT (0);
-  break;
- }
-    }
-  else
-    {
-      context->x8 = (uintptr_t)rvalue;
-      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
-    }
+  if (flags & AARCH64_RET_NEED_COPY)
+    memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
 static unsigned char trampoline [] =
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
index b6b6104..a3070db 100644
--- a/src/aarch64/internal.h
+++ b/src/aarch64/internal.h
@@ -18,7 +18,48 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
-#define AARCH64_FLAG_ARG_V_BIT 0
+#define AARCH64_RET_VOID 0
+#define AARCH64_RET_INT64 1
+#define AARCH64_RET_INT128 2
+
+#define AARCH64_RET_UNUSED3 3
+#define AARCH64_RET_UNUSED4 4
+#define AARCH64_RET_UNUSED5 5
+#define AARCH64_RET_UNUSED6 6
+#define AARCH64_RET_UNUSED7 7
+
+/* Note that FFI_TYPE_FLOAT == 2, _DOUBLE == 3, _LONGDOUBLE == 4,
+   so _S4 through _Q1 are laid out as (TYPE * 4) + (4 - COUNT).  */
+#define AARCH64_RET_S4 8
+#define AARCH64_RET_S3 9
+#define AARCH64_RET_S2 10
+#define AARCH64_RET_S1 11
+
+#define AARCH64_RET_D4 12
+#define AARCH64_RET_D3 13
+#define AARCH64_RET_D2 14
+#define AARCH64_RET_D1 15
+
+#define AARCH64_RET_Q4 16
+#define AARCH64_RET_Q3 17
+#define AARCH64_RET_Q2 18
+#define AARCH64_RET_Q1 19
+
+/* Note that each of the sub-64-bit integers gets two entries.  */
+#define AARCH64_RET_UINT8 20
+#define AARCH64_RET_UINT16 22
+#define AARCH64_RET_UINT32 24
+
+#define AARCH64_RET_SINT8 26
+#define AARCH64_RET_SINT16 28
+#define AARCH64_RET_SINT32 30
+
+#define AARCH64_RET_MASK 31
+
+#define AARCH64_RET_IN_MEM (1 << 5)
+#define AARCH64_RET_NEED_COPY (1 << 6)
+
+#define AARCH64_FLAG_ARG_V_BIT 7
 #define AARCH64_FLAG_ARG_V (1 << AARCH64_FLAG_ARG_V_BIT)
 
 #define N_X_ARG_REG 8
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index a5f636a..ba15663 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -40,9 +40,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 
  .text
- .align 2
+ .align 4
 
- .globl CNAME(ffi_call_SYSV)
+ .globl CNAME(ffi_call_SYSV)
 #ifdef __ELF__
  .type CNAME(ffi_call_SYSV), #function
  .hidden CNAME(ffi_call_SYSV)
@@ -50,14 +50,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
-      void (*fn)(void), int flags);
+      void (*fn)(void), void *rvalue, int flags);
 
    Therefore on entry we have:
 
    x0 stack
    x1 frame
    x2 fn
-   x3 flags
+   x3 rvalue
+   x4 flags
 */
 
  cfi_startproc
@@ -71,43 +72,111 @@ CNAME(ffi_call_SYSV):
  cfi_rel_offset (x29, 0)
  cfi_rel_offset (x30, 8)
 
- str w3, [x29, #16] /* save flags */
  mov x9, x2 /* save fn */
+ mov x8, x3 /* install structure return */
+ stp x3, x4, [x29, #16] /* save rvalue and flags */
 
  /* Load the vector argument passing registers, if necessary.  */
- tbz w3, #AARCH64_FLAG_ARG_V_BIT, 1f
- ldp     q0, q1, [x29, #32 + 0]
- ldp     q2, q3, [x29, #32 + 32]
- ldp     q4, q5, [x29, #32 + 64]
- ldp     q6, q7, [x29, #32 + 96]
+ tbz w4, #AARCH64_FLAG_ARG_V_BIT, 1f
+ ldp     q0, q1, [sp, #0]
+ ldp     q2, q3, [sp, #32]
+ ldp     q4, q5, [sp, #64]
+ ldp     q6, q7, [sp, #96]
 1:
  /* Load the core argument passing registers, including
    the structure return pointer.  */
- ldp     x0, x1, [x29, #32 + 16*N_V_ARG_REG + 0]
- ldp     x2, x3, [x29, #32 + 16*N_V_ARG_REG + 16]
- ldp     x4, x5, [x29, #32 + 16*N_V_ARG_REG + 32]
- ldp     x6, x7, [x29, #32 + 16*N_V_ARG_REG + 48]
- ldr     x8,     [x29, #32 + 16*N_V_ARG_REG + 64]
+ ldp     x0, x1, [sp, #16*N_V_ARG_REG + 0]
+ ldp     x2, x3, [sp, #16*N_V_ARG_REG + 16]
+ ldp     x4, x5, [sp, #16*N_V_ARG_REG + 32]
+ ldp     x6, x7, [sp, #16*N_V_ARG_REG + 48]
+
+ /* Deallocate the context, leaving the stacked arguments.  */
+ add sp, sp, #CALL_CONTEXT_SIZE
 
  blr     x9 /* call fn */
 
- ldr w3, [x29, #16] /* reload flags */
+ ldp x3, x4, [x29, #16] /* reload rvalue and flags */
 
  /* Partially deconstruct the stack frame.  */
  mov     sp, x29
  cfi_def_cfa_register (sp)
  ldp     x29, x30, [x29]
 
- /* Save the core return registers.  */
- stp     x0, x1, [sp, #32 + 16*N_V_ARG_REG]
-
- /* Save the vector return registers, if necessary.  */
- tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
- stp     q0, q1, [sp, #32 + 0]
- stp     q2, q3, [sp, #32 + 32]
-1:
- /* All done.  */
+ /* Save the return value as directed.  */
+ adr x5, 0f
+ and w4, w4, #AARCH64_RET_MASK
+ add x5, x5, x4, lsl #3
+ br x5
+
+ /* Note that each table entry is 2 insns, and thus 8 bytes.
+   For integer data, note that we're storing into ffi_arg
+   and therefore we want to extend to 64 bits; these types
+   have two consecutive entries allocated for them.  */
+ .align 4
+0: ret /* VOID */
+ nop
+1: str x0, [x3] /* INT64 */
+ ret
+2: stp x0, x1, [x3] /* INT128 */
+ ret
+3: brk #1000 /* UNUSED */
+ ret
+4: brk #1000 /* UNUSED */
+ ret
+5: brk #1000 /* UNUSED */
+ ret
+6: brk #1000 /* UNUSED */
+ ret
+7: brk #1000 /* UNUSED */
+ ret
+8: st4 { v0.s-v3.s }[0], [x3] /* S4 */
+ ret
+9: st3 { v0.s-v2.s }[0], [x3] /* S3 */
  ret
+10: stp s0, s1, [x3] /* S2 */
+ ret
+11: str s0, [x3] /* S1 */
+ ret
+12: st4 { v0.d-v3.d }[0], [x3] /* D4 */
+ ret
+13: st3 { v0.d-v2.d }[0], [x3] /* D3 */
+ ret
+14: stp d0, d1, [x3] /* D2 */
+ ret
+15: str d0, [x3] /* D1 */
+ ret
+16: str q3, [x3, #48] /* Q4 */
+ nop
+17: str q2, [x3, #32] /* Q3 */
+ nop
+18: stp q0, q1, [x3] /* Q2 */
+ ret
+19: str q0, [x3] /* Q1 */
+ ret
+20: uxtb w0, w0 /* UINT8 */
+ str x0, [x3]
+21: ret /* reserved */
+ nop
+22: uxth w0, w0 /* UINT16 */
+ str x0, [x3]
+23: ret /* reserved */
+ nop
+24: mov w0, w0 /* UINT32 */
+ str x0, [x3]
+25: ret /* reserved */
+ nop
+26: sxtb x0, w0 /* SINT8 */
+ str x0, [x3]
+27: ret /* reserved */
+ nop
+28: sxth x0, w0 /* SINT16 */
+ str x0, [x3]
+29: ret /* reserved */
+ nop
+30: sxtw x0, w0 /* SINT32 */
+ str x0, [x3]
+31: ret /* reserved */
+ nop
 
  cfi_endproc
 #ifdef __ELF__
@@ -154,9 +223,13 @@ CNAME(ffi_call_SYSV):
    Voila!  */
 
         .text
-        .align 2
+        .align 4
 
-        .globl CNAME(ffi_closure_SYSV)
+        .globl CNAME(ffi_closure_SYSV)
+#ifdef __ELF__
+ .type CNAME(ffi_closure_SYSV), #function
+ .hidden CNAME(ffi_closure_SYSV)
+#endif
         cfi_startproc
 CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 11/16] aarch64: Move return value handling into ffi_closure_SYSV

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

As with the change to ffi_call_SYSV, this avoids copying data
into a temporary buffer.
---
 src/aarch64/ffi.c       | 196 +++++++------------------------------
 src/aarch64/ffitarget.h |   2 +-
 src/aarch64/sysv.S      | 249 +++++++++++++++++++++++++++---------------------
 3 files changed, 176 insertions(+), 271 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index ffa1363..c5a429a 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,9 +71,6 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }
 
-extern void
-ffi_closure_SYSV (ffi_closure *);
-
 /* Test for an FFI floating point representation.  */
 
 static unsigned
@@ -211,69 +208,6 @@ is_hfa(const ffi_type *ty)
   return (ele_count << 8) | candidate;
 }
 
-/* Test if an ffi_type is a candidate for passing in a register.
-
-   This test does not check that sufficient registers of the
-   appropriate class are actually available, merely that IFF
-   sufficient registers are available then the argument will be passed
-   in register(s).
-
-   Note that an ffi_type that is deemed to be a register candidate
-   will always be returned in registers.
-
-   Returns 1 if a register candidate else 0.  */
-
-static int
-is_register_candidate (ffi_type *ty)
-{
-  switch (ty->type)
-    {
-    case FFI_TYPE_VOID:
-      return 0;
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-    case FFI_TYPE_LONGDOUBLE:
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT64:
-      return 1;
-
-    case FFI_TYPE_STRUCT:
-      if (is_hfa (ty))
-        {
-          return 1;
-        }
-      else if (ty->size > 16)
-        {
-          /* Too large. Will be replaced with a pointer to memory. The
-             pointer MAY be passed in a register, but the value will
-             not. This test specifically fails since the argument will
-             never be passed by value in registers. */
-          return 0;
-        }
-      else
-        {
-          /* Might be passed in registers depending on the number of
-             registers required. */
-          return (ty->size + 7) / 8 < N_X_ARG_REG;
-        }
-      break;
-
-    default:
-      FFI_ASSERT (0);
-      break;
-    }
-
-  return 0;
-}
-
 /* Test if an ffi_type argument or result is a candidate for a vector
    register.  */
 
@@ -797,42 +731,42 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
     memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
-static unsigned char trampoline [] =
-{ 0x70, 0x00, 0x00, 0x58, /* ldr x16, 1f */
-  0x91, 0x00, 0x00, 0x10, /* adr x17, 2f */
-  0x00, 0x02, 0x1f, 0xd6 /* br x16 */
-};
-
 /* Build a trampoline.  */
 
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS) \
-  ({unsigned char *__tramp = (unsigned char*)(TRAMP); \
-    UINT64  __fun = (UINT64)(FUN); \
-    UINT64  __ctx = (UINT64)(CTX); \
-    UINT64  __flags = (UINT64)(FLAGS); \
-    memcpy (__tramp, trampoline, sizeof (trampoline)); \
-    memcpy (__tramp + 12, &__fun, sizeof (__fun)); \
-    memcpy (__tramp + 20, &__ctx, sizeof (__ctx)); \
-    memcpy (__tramp + 28, &__flags, sizeof (__flags)); \
-    ffi_clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE); \
-  })
+extern void ffi_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
 
 ffi_status
-ffi_prep_closure_loc (ffi_closure* closure,
+ffi_prep_closure_loc (ffi_closure *closure,
                       ffi_cif* cif,
                       void (*fun)(ffi_cif*,void*,void**,void*),
                       void *user_data,
                       void *codeloc)
 {
+  static const unsigned char trampoline[16] = {
+    0x90, 0x00, 0x00, 0x58, /* ldr x16, tramp+16 */
+    0xf1, 0xff, 0xff, 0x10, /* adr x17, tramp+0 */
+    0x00, 0x02, 0x1f, 0xd6 /* br x16 */
+  };
+  char *tramp = closure->tramp;
+  void (*start)(void);
+
   if (cif->abi != FFI_SYSV)
     return FFI_BAD_ABI;
 
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc,
-       cif->aarch64_flags);
-
-  closure->cif  = cif;
+  closure->cif = cif;
+  closure->fun = fun;
   closure->user_data = user_data;
-  closure->fun  = fun;
+
+  memcpy (tramp, trampoline, sizeof(trampoline));
+
+  if (cif->flags & AARCH64_FLAG_ARG_V)
+    start = ffi_closure_SYSV_V;
+  else
+    start = ffi_closure_SYSV;
+  *(UINT64 *)(tramp + 16) = (uintptr_t)start;
+
+  ffi_clear_cache(tramp, tramp + FFI_TRAMPOLINE_SIZE);
 
   return FFI_OK;
 }
@@ -853,20 +787,20 @@ ffi_prep_closure_loc (ffi_closure* closure,
    descriptors, invokes the wrapped function, then marshalls the return
    value back into the call context.  */
 
-void FFI_HIDDEN
-ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
- void *stack)
+int FFI_HIDDEN
+ffi_closure_SYSV_inner (ffi_cif *cif,
+ void (*fun)(ffi_cif*,void*,void**,void*),
+ void *user_data,
+ struct call_context *context,
+ void *stack, void *rvalue)
 {
-  ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
-  void *rvalue = NULL;
-  int i, h, nargs = cif->nargs;
+  int i, h, nargs, flags;
   struct arg_state state;
-  ffi_type *rtype;
 
   arg_init (&state);
 
-  for (i = 0; i < nargs; i++)
+  for (i = 0, nargs = cif->nargs; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       int t = ty->type;
@@ -955,69 +889,11 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
  }
     }
 
-  /* Figure out where the return value will be passed, either in registers
-     or in a memory block allocated by the caller and passed in x8.  */
-  rtype = cif->rtype;
-  if (is_register_candidate (rtype))
-    {
-      size_t s = rtype->size;
-      int t;
-
-      /* Register candidates are *always* returned in registers. */
-
-      /* Allocate a scratchpad for the return value, we will let the
-         callee scrible the result into the scratch pad then move the
-         contents into the appropriate return value location for the
-         call convention.  */
-      rvalue = alloca (s);
-      (closure->fun) (cif, rvalue, avalue, closure->user_data);
-
-      /* Copy the return value into the call context so that it is returned
-         as expected to our caller.  */
-      t = rtype->type;
-      switch (t)
-        {
-        case FFI_TYPE_VOID:
-          break;
-
-        case FFI_TYPE_INT:
-        case FFI_TYPE_UINT8:
-        case FFI_TYPE_UINT16:
-        case FFI_TYPE_UINT32:
-        case FFI_TYPE_UINT64:
-        case FFI_TYPE_SINT8:
-        case FFI_TYPE_SINT16:
-        case FFI_TYPE_SINT32:
-        case FFI_TYPE_SINT64:
-        case FFI_TYPE_POINTER:
-  context->x[0] = extend_integer_type (rvalue, t);
-          break;
-
-        case FFI_TYPE_FLOAT:
-        case FFI_TYPE_DOUBLE:
-        case FFI_TYPE_LONGDOUBLE:
-  extend_hfa_type (&context->v[0], rvalue, 0x100 + t);
-  break;
+  flags = cif->flags;
+  if (flags & AARCH64_RET_IN_MEM)
+    rvalue = (void *)(uintptr_t)context->x8;
 
-        case FFI_TYPE_STRUCT:
-  h = is_hfa (cif->rtype);
-          if (h)
-    extend_hfa_type (&context->v[0], rvalue, h);
-          else
-    {
-      FFI_ASSERT (s <= 16);
-              memcpy (&context->x[0], rvalue, s);
-            }
-          break;
+  fun (cif, rvalue, avalue, user_data);
 
-        default:
-          abort();
-        }
-    }
-  else
-    {
-      rvalue = (void *)(uintptr_t)context->x8;
-      (closure->fun) (cif, rvalue, avalue, closure->user_data);
-    }
+  return flags;
 }
-
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 336f28a..b488bbe 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -42,7 +42,7 @@ typedef enum ffi_abi
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE 36
+#define FFI_TRAMPOLINE_SIZE 24
 #define FFI_NATIVE_RAW_API 0
 
 /* ---- Internal ---- */
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index ba15663..abd848d 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -39,15 +39,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 #endif
 
+#ifdef __AARCH64EB__
+# define BE(X) X
+#else
+# define BE(X) 0
+#endif
+
  .text
  .align 4
 
- .globl CNAME(ffi_call_SYSV)
-#ifdef __ELF__
- .type CNAME(ffi_call_SYSV), #function
- .hidden CNAME(ffi_call_SYSV)
-#endif
-
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
       void (*fn)(void), void *rvalue, int flags);
@@ -179,131 +179,160 @@ CNAME(ffi_call_SYSV):
  nop
 
  cfi_endproc
+
+ .globl CNAME(ffi_call_SYSV)
 #ifdef __ELF__
-        .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
+ .type CNAME(ffi_call_SYSV), #function
+ .hidden CNAME(ffi_call_SYSV)
+ .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
 #endif
 
-#define ffi_closure_SYSV_FS (8 * 2 + CALL_CONTEXT_SIZE)
-
 /* ffi_closure_SYSV
 
    Closure invocation glue. This is the low level code invoked directly by
    the closure trampoline to setup and call a closure.
 
-   On entry x17 points to a struct trampoline_data, x16 has been clobbered
+   On entry x17 points to a struct ffi_closure, x16 has been clobbered
    all other registers are preserved.
 
    We allocate a call context and save the argument passing registers,
    then invoked the generic C ffi_closure_SYSV_inner() function to do all
    the real work, on return we load the result passing registers back from
    the call context.
+*/
 
-   On entry
-
-   extern void
-   ffi_closure_SYSV (struct trampoline_data *);
-
-   struct trampoline_data
-   {
-        UINT64 *ffi_closure;
-        UINT64 flags;
-   };
-
-   This function uses the following stack frame layout:
-
-   ==
-                saved x30(lr)
-   x29(fp)->    saved x29(fp)
-                saved x22
-                saved x21
-                ...
-   sp     ->    call_context
-   ==
+#define ffi_closure_SYSV_FS (8*2 + CALL_CONTEXT_SIZE + 64)
 
-   Voila!  */
+ .align 4
+CNAME(ffi_closure_SYSV_V):
+ cfi_startproc
+ stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+ cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+ cfi_rel_offset (x29, 0)
+ cfi_rel_offset (x30, 8)
 
-        .text
-        .align 4
+ /* Save the argument passing vector registers.  */
+ stp     q0, q1, [sp, #16 + 0]
+ stp     q2, q3, [sp, #16 + 32]
+ stp     q4, q5, [sp, #16 + 64]
+ stp     q6, q7, [sp, #16 + 96]
+ b 0f
+ cfi_endproc
 
-        .globl CNAME(ffi_closure_SYSV)
+ .globl CNAME(ffi_closure_SYSV_V)
 #ifdef __ELF__
- .type CNAME(ffi_closure_SYSV), #function
- .hidden CNAME(ffi_closure_SYSV)
+ .type CNAME(ffi_closure_SYSV_V), #function
+ .hidden CNAME(ffi_closure_SYSV_V)
+ .size CNAME(ffi_closure_SYSV_V), . - CNAME(ffi_closure_SYSV_V)
 #endif
-        cfi_startproc
-CNAME(ffi_closure_SYSV):
-        stp     x29, x30, [sp, #-16]!
- cfi_adjust_cfa_offset (16)
-        cfi_rel_offset (x29, 0)
-        cfi_rel_offset (x30, 8)
-
-        mov     x29, sp
-        cfi_def_cfa_register (x29)
-
-        sub     sp, sp, #ffi_closure_SYSV_FS
-
-        stp     x21, x22, [x29, #-16]
-        cfi_rel_offset (x21, -16)
-        cfi_rel_offset (x22, -8)
-
-        /* Load x21 with &call_context.  */
-        mov     x21, sp
-        /* Preserve our struct trampoline_data *  */
-        mov     x22, x17
-
-        /* Save the rest of the argument passing registers, including
-   the structure return pointer.  */
-        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-        str     x8,     [x21, #16*N_V_ARG_REG + 64]
-
-        /* Figure out if we should touch the vector registers.  */
-        ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
-
-        /* Save the argument passing vector registers.  */
-        stp     q0, q1, [x21, #0]
-        stp     q2, q3, [x21, #32]
-        stp     q4, q5, [x21, #64]
-        stp     q6, q7, [x21, #96]
-1:
-        /* Load &ffi_closure..  */
-        ldr     x0, [x22, #0]
-        mov     x1, x21
-        /* Compute the location of the stack at the point that the
-           trampoline was called.  */
-        add     x2, x29, #16
-
-        bl      CNAME(ffi_closure_SYSV_inner)
-
-        /* Figure out if we should touch the vector registers.  */
-        ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
-
-        /* Load the result passing vector registers.  */
-        ldp     q0, q1, [x21, #0]
-        ldp     q2, q3, [x21, #32]
-1:
-        /* Load the result passing core registers.  */
-        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-
-        /* We are done, unwind our frame.  */
-        ldp     x21, x22, [x29,  #-16]
-        cfi_restore (x21)
-        cfi_restore (x22)
 
-        mov     sp, x29
-        cfi_def_cfa_register (sp)
-
-        ldp     x29, x30, [sp], #16
- cfi_adjust_cfa_offset (-16)
-        cfi_restore (x29)
-        cfi_restore (x30)
-
-        ret
+ .align 4
+ cfi_startproc
+CNAME(ffi_closure_SYSV):
+ stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+ cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+ cfi_rel_offset (x29, 0)
+ cfi_rel_offset (x30, 8)
+0:
+ mov     x29, sp
+
+ /* Save the argument passing core registers.  */
+ stp     x0, x1, [sp, #16 + 16*N_V_ARG_REG + 0]
+ stp     x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
+ stp     x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
+ stp     x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
+ str     x8,     [sp, #16 + 16*N_V_ARG_REG + 64]
+
+ /* Load ffi_closure_inner arguments.  */
+ ldp x0, x1, [x17, #FFI_TRAMPOLINE_SIZE] /* load cif, fn */
+ ldr x2, [x17, #FFI_TRAMPOLINE_SIZE+16] /* load user_data */
+ add x3, sp, #16 /* load context */
+ add x4, sp, #ffi_closure_SYSV_FS /* load stack */
+ add x5, sp, #16+CALL_CONTEXT_SIZE /* load rvalue */
+ bl      CNAME(ffi_closure_SYSV_inner)
+
+ /* Load the return value as directed.  */
+ adr x1, 0f
+ and w0, w0, #AARCH64_RET_MASK
+ add x1, x1, x0, lsl #3
+ add x3, sp, #16+CALL_CONTEXT_SIZE
+ br x1
+
+ /* Note that each table entry is 2 insns, and thus 8 bytes.  */
+ .align 4
+0: b 99f /* VOID */
+ nop
+1: ldr x0, [x3] /* INT64 */
+ b 99f
+2: ldp x0, x1, [x3] /* INT128 */
+ b 99f
+3: brk #1000 /* UNUSED */
+ nop
+4: brk #1000 /* UNUSED */
+ nop
+5: brk #1000 /* UNUSED */
+ nop
+6: brk #1000 /* UNUSED */
+ nop
+7: brk #1000 /* UNUSED */
+ nop
+8: ldr s3, [x3, #12] /* S4 */
+ nop
+9: ldr s2, [x3, #8] /* S3 */
+ nop
+10: ldp s0, s1, [x3] /* S2 */
+ b 99f
+11: ldr s0, [x3] /* S1 */
+ b 99f
+12: ldr d3, [x3, #24] /* D4 */
+ nop
+13: ldr d2, [x3, #16] /* D3 */
+ nop
+14: ldp d0, d1, [x3] /* D2 */
+ b 99f
+15: ldr d0, [x3] /* D1 */
+ b 99f
+16: ldr q3, [x3, #48] /* Q4 */
+ nop
+17: ldr q2, [x3, #32] /* Q3 */
+ nop
+18: ldp q0, q1, [x3] /* Q2 */
+ b 99f
+19: ldr q0, [x3] /* Q1 */
+ b 99f
+20: ldrb w0, [x3, #BE(7)] /* UINT8 */
+ b 99f
+21: brk #1000 /* reserved */
+ nop
+22: ldrh w0, [x3, #BE(6)] /* UINT16 */
+ b 99f
+23: brk #1000 /* reserved */
+ nop
+24: ldr w0, [x3, #BE(4)] /* UINT32 */
+ b 99f
+25: brk #1000 /* reserved */
+ nop
+26: ldrsb x0, [x3, #BE(7)] /* SINT8 */
+ b 99f
+27: brk #1000 /* reserved */
+ nop
+28: ldrsh x0, [x3, #BE(6)] /* SINT16 */
+ b 99f
+29: brk #1000 /* reserved */
+ nop
+30: ldrsw x0, [x3, #BE(4)] /* SINT32 */
+ nop
+31: /* reserved */
+99: ldp     x29, x30, [sp], #ffi_closure_SYSV_FS
+ cfi_adjust_cfa_offset (-ffi_closure_SYSV_FS)
+ cfi_restore (x29)
+ cfi_restore (x30)
+ ret
  cfi_endproc
+
+ .globl CNAME(ffi_closure_SYSV)
 #ifdef __ELF__
-        .size CNAME(ffi_closure_SYSV), .-CNAME(ffi_closure_SYSV)
+ .type CNAME(ffi_closure_SYSV), #function
+ .hidden CNAME(ffi_closure_SYSV)
+ .size CNAME(ffi_closure_SYSV), . - CNAME(ffi_closure_SYSV)
 #endif
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 12/16] aarch64: Unify scalar fp and hfa handling

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

Since an HFA of a single element is exactly the same as a scalar,
treating both cases through one code path tidies things up a bit.
---
 src/aarch64/ffi.c | 225 ++++++++++++++++++++++--------------------------------
 1 file changed, 91 insertions(+), 134 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index c5a429a..f69c350 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,16 +71,7 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }
 
-/* Test for an FFI floating point representation.  */
-
-static unsigned
-is_floating_type (unsigned short type)
-{
-  return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE
-  || type == FFI_TYPE_LONGDOUBLE);
-}
-
-/* A subroutine of is_hfa.  Given a structure type, return the type code
+/* A subroutine of is_vfp_type.  Given a structure type, return the type code
    of the first non-structure element.  Recurse for structure elements.
    Return -1 if the structure is in fact empty, i.e. no nested elements.  */
 
@@ -106,7 +97,7 @@ is_hfa0 (const ffi_type *ty)
   return ret;
 }
 
-/* A subroutine of is_hfa.  Given a structure type, return true if all
+/* A subroutine of is_vfp_type.  Given a structure type, return true if all
    of the non-structure elements are the same as CANDIDATE.  */
 
 static int
@@ -131,23 +122,35 @@ is_hfa1 (const ffi_type *ty, int candidate)
   return 1;
 }
 
-/* Determine if TY is an homogenous floating point aggregate (HFA).
+/* Determine if TY may be allocated to the FP registers.  This is both an
+   fp scalar type as well as an homogenous floating point aggregate (HFA).
    That is, a structure consisting of 1 to 4 members of all the same type,
-   where that type is a floating point scalar.
+   where that type is an fp scalar.
 
-   Returns non-zero iff TY is an HFA.  The result is an encoded value where
-   bits 0-7 contain the type code, and bits 8-10 contain the element count.  */
+   Returns non-zero iff TY is an fp scalar or an HFA.  The result is the
+   AARCH64_RET_* constant for the type.  */
 
 static int
-is_hfa(const ffi_type *ty)
+is_vfp_type (const ffi_type *ty)
 {
   ffi_type **elements;
   int candidate, i;
   size_t size, ele_count;
 
   /* Quickest tests first.  */
-  if (ty->type != FFI_TYPE_STRUCT)
-    return 0;
+  switch (ty->type)
+    {
+    default:
+      return 0;
+    case FFI_TYPE_FLOAT:
+      return AARCH64_RET_S1;
+    case FFI_TYPE_DOUBLE:
+      return AARCH64_RET_D1;
+    case FFI_TYPE_LONGDOUBLE:
+      return AARCH64_RET_Q1;
+    case FFI_TYPE_STRUCT:
+      break;
+    }
 
   /* No HFA types are smaller than 4 bytes, or larger than 64 bytes.  */
   size = ty->size;
@@ -205,17 +208,7 @@ is_hfa(const ffi_type *ty)
     }
 
   /* All tests succeeded.  Encode the result.  */
-  return (ele_count << 8) | candidate;
-}
-
-/* Test if an ffi_type argument or result is a candidate for a vector
-   register.  */
-
-static int
-is_v_register_candidate (ffi_type *ty)
-{
-  return is_floating_type (ty->type)
-   || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty));
+  return candidate * 4 + (4 - ele_count);
 }
 
 /* Representation of the procedure call argument marshalling
@@ -302,9 +295,7 @@ extend_integer_type (void *source, int type)
 static void
 extend_hfa_type (void *dest, void *src, int h)
 {
-  int n = (h >> 8);
-  int t = h & 0xff;
-  int f = (t - FFI_TYPE_FLOAT) * 4 + 4 - n;
+  int f = h - AARCH64_RET_S4;
   void *x0;
 
   asm volatile (
@@ -358,82 +349,68 @@ extend_hfa_type (void *dest, void *src, int h)
 static void *
 compress_hfa_type (void *dest, void *reg, int h)
 {
-  int n = h >> 8;
-  switch (h & 0xff)
+  switch (h)
     {
-    case FFI_TYPE_FLOAT:
-      switch (n)
+    case AARCH64_RET_S1:
+      if (dest == reg)
  {
- default:
-  if (dest == reg)
-    {
 #ifdef __AARCH64EB__
-      dest += 12;
+  dest += 12;
 #endif
-    }
-  else
-    *(float *)dest = *(float *)reg;
-  break;
- case 2:
-  asm("ldp q16, q17, [%1]\n\t"
-      "st2 { v16.s, v17.s }[0], [%0]"
-      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
-  break;
- case 3:
-  asm("ldp q16, q17, [%1]\n\t"
-      "ldr q18, [%1, #32]\n\t"
-      "st3 { v16.s, v17.s, v18.s }[0], [%0]"
-      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
-  break;
- case 4:
-  asm("ldp q16, q17, [%1]\n\t"
-      "ldp q18, q19, [%1, #32]\n\t"
-      "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
-      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
-  break;
  }
+      else
+ *(float *)dest = *(float *)reg;
+      break;
+    case AARCH64_RET_S2:
+      asm ("ldp q16, q17, [%1]\n\t"
+   "st2 { v16.s, v17.s }[0], [%0]"
+   : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+      break;
+    case AARCH64_RET_S3:
+      asm ("ldp q16, q17, [%1]\n\t"
+   "ldr q18, [%1, #32]\n\t"
+   "st3 { v16.s, v17.s, v18.s }[0], [%0]"
+   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+      break;
+    case AARCH64_RET_S4:
+      asm ("ldp q16, q17, [%1]\n\t"
+   "ldp q18, q19, [%1, #32]\n\t"
+   "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
+   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
       break;
 
-    case FFI_TYPE_DOUBLE:
-      switch (n)
+    case AARCH64_RET_D1:
+      if (dest == reg)
  {
- default:
-  if (dest == reg)
-    {
 #ifdef __AARCH64EB__
-      dest += 8;
+  dest += 8;
 #endif
-    }
-  else
-    *(double *)dest = *(double *)reg;
-  break;
- case 2:
-  asm("ldp q16, q17, [%1]\n\t"
-      "st2 { v16.d, v17.d }[0], [%0]"
-      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
-  break;
- case 3:
-  asm("ldp q16, q17, [%1]\n\t"
-      "ldr q18, [%1, #32]\n\t"
-      "st3 { v16.d, v17.d, v18.d }[0], [%0]"
-      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
-  break;
- case 4:
-  asm("ldp q16, q17, [%1]\n\t"
-      "ldp q18, q19, [%1, #32]\n\t"
-      "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
-      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
-  break;
  }
+      else
+ *(double *)dest = *(double *)reg;
       break;
-
-    case FFI_TYPE_LONGDOUBLE:
-      if (dest != reg)
- return memcpy (dest, reg, 16 * n);
+    case AARCH64_RET_D2:
+      asm ("ldp q16, q17, [%1]\n\t"
+   "st2 { v16.d, v17.d }[0], [%0]"
+   : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+      break;
+    case AARCH64_RET_D3:
+      asm ("ldp q16, q17, [%1]\n\t"
+   "ldr q18, [%1, #32]\n\t"
+   "st3 { v16.d, v17.d, v18.d }[0], [%0]"
+   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+      break;
+    case AARCH64_RET_D4:
+      asm ("ldp q16, q17, [%1]\n\t"
+   "ldp q18, q19, [%1, #32]\n\t"
+   "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
+   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
       break;
 
     default:
-      FFI_ASSERT (0);
+      if (dest != reg)
+ return memcpy (dest, reg, 16 * (4 - (h & 3)));
+      break;
     }
   return dest;
 }
@@ -494,34 +471,25 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       break;
 
     case FFI_TYPE_FLOAT:
-      flags = AARCH64_RET_S1;
-      break;
     case FFI_TYPE_DOUBLE:
-      flags = AARCH64_RET_D1;
-      break;
     case FFI_TYPE_LONGDOUBLE:
-      flags = AARCH64_RET_Q1;
-      break;
-
     case FFI_TYPE_STRUCT:
-      {
- int h = is_hfa (rtype);
- size_t s = rtype->size;
-
- if (h)
-  flags = (h & 0xff) * 4 + 4 - (h >> 8);
- else if (s > 16)
-  {
-    flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
-    bytes += 8;
-  }
- else if (s == 16)
-  flags = AARCH64_RET_INT128;
- else if (s == 8)
-  flags = AARCH64_RET_INT64;
- else
-  flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
-      }
+      flags = is_vfp_type (rtype);
+      if (flags == 0)
+ {
+  size_t s = rtype->size;
+  if (s > 16)
+    {
+      flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
+      bytes += 8;
+    }
+  else if (s == 16)
+    flags = AARCH64_RET_INT128;
+  else if (s == 8)
+    flags = AARCH64_RET_INT64;
+  else
+    flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
+ }
       break;
 
     default:
@@ -530,7 +498,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 
   aarch64_flags = 0;
   for (i = 0, n = cif->nargs; i < n; i++)
-    if (is_v_register_candidate (cif->arg_types[i]))
+    if (is_vfp_type (cif->arg_types[i]))
       {
  aarch64_flags = AARCH64_FLAG_ARG_V;
  flags |= AARCH64_FLAG_ARG_V;
@@ -652,20 +620,14 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
  case FFI_TYPE_FLOAT:
  case FFI_TYPE_DOUBLE:
  case FFI_TYPE_LONGDOUBLE:
-  /* Scalar float is a degenerate case of HFA.  */
-  h = t + 0x100;
-  goto do_hfa;
-
  case FFI_TYPE_STRUCT:
   {
     void *dest;
-    int elems;
 
-    h = is_hfa (ty);
+    h = is_vfp_type (ty);
     if (h)
       {
-    do_hfa:
- elems = h >> 8;
+ int elems = 4 - (h & 3);
         if (state.nsrn + elems <= N_V_ARG_REG)
   {
     dest = &context->v[state.nsrn];
@@ -828,16 +790,11 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
  case FFI_TYPE_FLOAT:
  case FFI_TYPE_DOUBLE:
  case FFI_TYPE_LONGDOUBLE:
-  /* Scalar float is a degenerate case of HFA.  */
-  h = t + 0x100;
-  goto do_hfa;
-
  case FFI_TYPE_STRUCT:
-  h = is_hfa (ty);
+  h = is_vfp_type (ty);
   if (h)
     {
-    do_hfa:
-      n = h >> 8;
+      n = 4 - (h & 3);
       if (state.nsrn + n <= N_V_ARG_REG)
  {
   void *reg = &context->v[state.nsrn];
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 13/16] aarch64: Remove aarch64_flags

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

This field was useless from the start, since the normal flags
field is available for backend use.
---
 src/aarch64/ffi.c       | 5 +----
 src/aarch64/ffitarget.h | 4 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index f69c350..b3e0b16 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -436,7 +436,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 {
   ffi_type *rtype = cif->rtype;
   size_t bytes = cif->bytes;
-  int flags, aarch64_flags, i, n;
+  int flags, i, n;
 
   switch (rtype->type)
     {
@@ -496,11 +496,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       abort();
     }
 
-  aarch64_flags = 0;
   for (i = 0, n = cif->nargs; i < n; i++)
     if (is_vfp_type (cif->arg_types[i]))
       {
- aarch64_flags = AARCH64_FLAG_ARG_V;
  flags |= AARCH64_FLAG_ARG_V;
  break;
       }
@@ -508,7 +506,6 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   /* Round the stack up to a multiple of the stack alignment requirement. */
   cif->bytes = ALIGN(bytes, 16);
   cif->flags = flags;
-  cif->aarch64_flags = aarch64_flags;
 #if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
 #endif
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index b488bbe..6d6d3e6 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -49,9 +49,7 @@ typedef enum ffi_abi
 
 #if defined (__APPLE__)
 #define FFI_TARGET_SPECIFIC_VARIADIC
-#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags; unsigned aarch64_nfixedargs
-#else
-#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags
+#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
 #endif
 
 #endif
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 14/16] aarch64: Add support for complex types

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

---
 src/aarch64/ffi.c              | 34 +++++++++++++++++++++++++---------
 src/aarch64/ffitarget.h        |  2 ++
 testsuite/libffi.call/call.exp | 10 +++-------
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index b3e0b16..4f85140 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -85,7 +85,7 @@ is_hfa0 (const ffi_type *ty)
     for (i = 0; elements[i]; ++i)
       {
         ret = elements[i]->type;
-        if (ret == FFI_TYPE_STRUCT)
+        if (ret == FFI_TYPE_STRUCT || ret == FFI_TYPE_COMPLEX)
           {
             ret = is_hfa0 (elements[i]);
             if (ret < 0)
@@ -110,7 +110,7 @@ is_hfa1 (const ffi_type *ty, int candidate)
     for (i = 0; elements[i]; ++i)
       {
         int t = elements[i]->type;
-        if (t == FFI_TYPE_STRUCT)
+        if (t == FFI_TYPE_STRUCT || t == FFI_TYPE_COMPLEX)
           {
             if (!is_hfa1 (elements[i], candidate))
               return 0;
@@ -138,16 +138,27 @@ is_vfp_type (const ffi_type *ty)
   size_t size, ele_count;
 
   /* Quickest tests first.  */
-  switch (ty->type)
+  candidate = ty->type;
+  switch (candidate)
     {
     default:
       return 0;
     case FFI_TYPE_FLOAT:
-      return AARCH64_RET_S1;
     case FFI_TYPE_DOUBLE:
-      return AARCH64_RET_D1;
     case FFI_TYPE_LONGDOUBLE:
-      return AARCH64_RET_Q1;
+      ele_count = 1;
+      goto done;
+    case FFI_TYPE_COMPLEX:
+      candidate = ty->elements[0]->type;
+      switch (candidate)
+ {
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+  ele_count = 2;
+  goto done;
+ }
+      return 0;
     case FFI_TYPE_STRUCT:
       break;
     }
@@ -160,7 +171,7 @@ is_vfp_type (const ffi_type *ty)
   /* Find the type of the first non-structure member.  */
   elements = ty->elements;
   candidate = elements[0]->type;
-  if (candidate == FFI_TYPE_STRUCT)
+  if (candidate == FFI_TYPE_STRUCT || candidate == FFI_TYPE_COMPLEX)
     {
       for (i = 0; ; ++i)
         {
@@ -198,16 +209,18 @@ is_vfp_type (const ffi_type *ty)
   /* Finally, make sure that all scalar elements are the same type.  */
   for (i = 0; elements[i]; ++i)
     {
-      if (elements[i]->type == FFI_TYPE_STRUCT)
+      int t = elements[i]->type;
+      if (t == FFI_TYPE_STRUCT || t == FFI_TYPE_COMPLEX)
         {
           if (!is_hfa1 (elements[i], candidate))
             return 0;
         }
-      else if (elements[i]->type != candidate)
+      else if (t != candidate)
         return 0;
     }
 
   /* All tests succeeded.  Encode the result.  */
+ done:
   return candidate * 4 + (4 - ele_count);
 }
 
@@ -474,6 +487,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
     case FFI_TYPE_DOUBLE:
     case FFI_TYPE_LONGDOUBLE:
     case FFI_TYPE_STRUCT:
+    case FFI_TYPE_COMPLEX:
       flags = is_vfp_type (rtype);
       if (flags == 0)
  {
@@ -618,6 +632,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
  case FFI_TYPE_DOUBLE:
  case FFI_TYPE_LONGDOUBLE:
  case FFI_TYPE_STRUCT:
+ case FFI_TYPE_COMPLEX:
   {
     void *dest;
 
@@ -788,6 +803,7 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
  case FFI_TYPE_DOUBLE:
  case FFI_TYPE_LONGDOUBLE:
  case FFI_TYPE_STRUCT:
+ case FFI_TYPE_COMPLEX:
   h = is_vfp_type (ty);
   if (h)
     {
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 6d6d3e6..7461386 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -52,4 +52,6 @@ typedef enum ffi_abi
 #define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
 #endif
 
+#define FFI_TARGET_HAS_COMPLEX_TYPE
+
 #endif
diff --git a/testsuite/libffi.call/call.exp b/testsuite/libffi.call/call.exp
index 5177f07..ceacd49 100644
--- a/testsuite/libffi.call/call.exp
+++ b/testsuite/libffi.call/call.exp
@@ -24,16 +24,12 @@ set ctlist [lsearch -inline -all -glob [lsort [glob -nocomplain -- $srcdir/$subd
 
 run-many-tests $tlist ""
 
-if { ![istarget s390*] } {
-
+if { [istarget s390*] || [istarget aarch64*] } {
+    run-many-tests $ctlist ""
+} else {
     foreach test $ctlist {
  unsupported "$test"
     }
-
-} else {
-
-  run-many-tests $ctlist ""
-
 }
 
 dg-finish
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 15/16] aarch64: Move x8 out of call_context

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

Reduces stack size.  The x8 slot was only used by the closure path, and
there are spare argument registers available to pass the value instead.
---
 src/aarch64/ffi.c      | 5 ++---
 src/aarch64/internal.h | 2 +-
 src/aarch64/sysv.S     | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 4f85140..f546ab2 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -52,7 +52,6 @@ struct call_context
 {
   struct _v v[N_V_ARG_REG];
   UINT64 x[N_X_ARG_REG];
-  UINT64 x8;
 };
 
 #if defined (__clang__) && defined (__APPLE__)
@@ -766,7 +765,7 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
  void (*fun)(ffi_cif*,void*,void**,void*),
  void *user_data,
  struct call_context *context,
- void *stack, void *rvalue)
+ void *stack, void *rvalue, void *struct_rvalue)
 {
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
   int i, h, nargs, flags;
@@ -861,7 +860,7 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
 
   flags = cif->flags;
   if (flags & AARCH64_RET_IN_MEM)
-    rvalue = (void *)(uintptr_t)context->x8;
+    rvalue = struct_rvalue;
 
   fun (cif, rvalue, avalue, user_data);
 
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
index a3070db..9c3e077 100644
--- a/src/aarch64/internal.h
+++ b/src/aarch64/internal.h
@@ -64,4 +64,4 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #define N_X_ARG_REG 8
 #define N_V_ARG_REG 8
-#define CALL_CONTEXT_SIZE (N_V_ARG_REG * 16 + N_X_ARG_REG * 8 + 16)
+#define CALL_CONTEXT_SIZE (N_V_ARG_REG * 16 + N_X_ARG_REG * 8)
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index abd848d..7f00a3f 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -241,7 +241,6 @@ CNAME(ffi_closure_SYSV):
  stp     x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
  stp     x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
  stp     x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
- str     x8,     [sp, #16 + 16*N_V_ARG_REG + 64]
 
  /* Load ffi_closure_inner arguments.  */
  ldp x0, x1, [x17, #FFI_TRAMPOLINE_SIZE] /* load cif, fn */
@@ -249,6 +248,7 @@ CNAME(ffi_closure_SYSV):
  add x3, sp, #16 /* load context */
  add x4, sp, #ffi_closure_SYSV_FS /* load stack */
  add x5, sp, #16+CALL_CONTEXT_SIZE /* load rvalue */
+ mov x6, x8 /* load struct_rval */
  bl      CNAME(ffi_closure_SYSV_inner)
 
  /* Load the return value as directed.  */
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

[PATCH 16/16] aarch64: Add support for Go closures

Richard Henderson
In reply to this post by Richard Henderson
From: Richard Henderson <[hidden email]>

---
 src/aarch64/ffi.c       | 52 +++++++++++++++++++++++++++++++++++++----
 src/aarch64/ffitarget.h |  4 ++++
 src/aarch64/sysv.S      | 62 ++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index f546ab2..0cace9d 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -539,13 +539,14 @@ ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
 #endif /* __APPLE__ */
 
 extern void ffi_call_SYSV (struct call_context *context, void *frame,
-   void (*fn)(void), void *rvalue, int flags)
- FFI_HIDDEN;
+   void (*fn)(void), void *rvalue, int flags,
+   void *closure) FFI_HIDDEN;
 
 /* Call a function with the provided arguments and capture the return
    value.  */
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
+      void **avalue, void *closure)
 {
   struct call_context *context;
   void *stack, *frame, *rvalue;
@@ -698,12 +699,27 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 #endif
     }
 
-  ffi_call_SYSV (context, frame, fn, rvalue, flags);
+  ffi_call_SYSV (context, frame, fn, rvalue, flags, closure);
 
   if (flags & AARCH64_RET_NEED_COPY)
     memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+#ifdef FFI_GO_CLOSURES
+void
+ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue,
+     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+#endif /* FFI_GO_CLOSURES */
+
 /* Build a trampoline.  */
 
 extern void ffi_closure_SYSV (void) FFI_HIDDEN;
@@ -744,6 +760,32 @@ ffi_prep_closure_loc (ffi_closure *closure,
   return FFI_OK;
 }
 
+#ifdef FFI_GO_CLOSURES
+extern void ffi_go_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_go_closure_SYSV_V (void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif* cif,
+                     void (*fun)(ffi_cif*,void*,void**,void*))
+{
+  void (*start)(void);
+
+  if (cif->abi != FFI_SYSV)
+    return FFI_BAD_ABI;
+
+  if (cif->flags & AARCH64_FLAG_ARG_V)
+    start = ffi_go_closure_SYSV_V;
+  else
+    start = ffi_go_closure_SYSV;
+
+  closure->tramp = start;
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+#endif /* FFI_GO_CLOSURES */
+
 /* Primary handler to setup and invoke a function within a closure.
 
    A closure when invoked enters via the assembler wrapper
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 7461386..80d09af 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -50,6 +50,10 @@ typedef enum ffi_abi
 #if defined (__APPLE__)
 #define FFI_TARGET_SPECIFIC_VARIADIC
 #define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
+#else
+/* iOS reserves x18 for the system, so Go closures stay disabled there
+   until a new static chain is chosen; enable them everywhere else.  */
+#define FFI_GO_CLOSURES 1
 #endif
 
 #define FFI_TARGET_HAS_COMPLEX_TYPE
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index 7f00a3f..1fb68f2 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -50,7 +50,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
-      void (*fn)(void), void *rvalue, int flags);
+      void (*fn)(void), void *rvalue,
+      int flags, void *closure);
 
    Therefore on entry we have:
 
@@ -59,6 +60,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
    x2 fn
    x3 rvalue
    x4 flags
+   x5 closure
 */
 
  cfi_startproc
@@ -74,6 +76,9 @@ CNAME(ffi_call_SYSV):
 
  mov x9, x2 /* save fn */
  mov x8, x3 /* install structure return */
+#ifdef FFI_GO_CLOSURES
+ mov x18, x5 /* install static chain */
+#endif
  stp x3, x4, [x29, #16] /* save rvalue and flags */
 
  /* Load the vector argument passing registers, if necessary.  */
@@ -245,6 +250,7 @@ CNAME(ffi_closure_SYSV):
  /* Load ffi_closure_inner arguments.  */
  ldp x0, x1, [x17, #FFI_TRAMPOLINE_SIZE] /* load cif, fn */
  ldr x2, [x17, #FFI_TRAMPOLINE_SIZE+16] /* load user_data */
+.Ldo_closure:
  add x3, sp, #16 /* load context */
  add x4, sp, #ffi_closure_SYSV_FS /* load stack */
  add x5, sp, #16+CALL_CONTEXT_SIZE /* load rvalue */
@@ -336,3 +342,57 @@ CNAME(ffi_closure_SYSV):
  .hidden CNAME(ffi_closure_SYSV)
  .size CNAME(ffi_closure_SYSV), . - CNAME(ffi_closure_SYSV)
 #endif
+
+#ifdef FFI_GO_CLOSURES
+ .align 4
+CNAME(ffi_go_closure_SYSV_V):
+ cfi_startproc
+ stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+ cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+ cfi_rel_offset (x29, 0)
+ cfi_rel_offset (x30, 8)
+
+ /* Save the argument passing vector registers.  */
+ stp     q0, q1, [sp, #16 + 0]
+ stp     q2, q3, [sp, #16 + 32]
+ stp     q4, q5, [sp, #16 + 64]
+ stp     q6, q7, [sp, #16 + 96]
+ b 0f
+ cfi_endproc
+
+ .globl CNAME(ffi_go_closure_SYSV_V)
+#ifdef __ELF__
+ .type CNAME(ffi_go_closure_SYSV_V), #function
+ .hidden CNAME(ffi_go_closure_SYSV_V)
+ .size CNAME(ffi_go_closure_SYSV_V), . - CNAME(ffi_go_closure_SYSV_V)
+#endif
+
+ .align 4
+ cfi_startproc
+CNAME(ffi_go_closure_SYSV):
+ stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+ cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+ cfi_rel_offset (x29, 0)
+ cfi_rel_offset (x30, 8)
+0:
+ mov     x29, sp
+
+ /* Save the argument passing core registers.  */
+ stp     x0, x1, [sp, #16 + 16*N_V_ARG_REG + 0]
+ stp     x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
+ stp     x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
+ stp     x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
+
+ /* Load ffi_closure_inner arguments.  */
+ ldp x0, x1, [x18, #8] /* load cif, fn */
+ mov x2, x18 /* load user_data */
+ b .Ldo_closure
+ cfi_endproc
+
+ .globl CNAME(ffi_go_closure_SYSV)
+#ifdef __ELF__
+ .type CNAME(ffi_go_closure_SYSV), #function
+ .hidden CNAME(ffi_go_closure_SYSV)
+ .size CNAME(ffi_go_closure_SYSV), . - CNAME(ffi_go_closure_SYSV)
+#endif
+#endif /* FFI_GO_CLOSURES */
--
1.9.3

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 00/16] Go closures for aarch64

James Greenhalgh-2
In reply to this post by Richard Henderson
On Tue, Oct 28, 2014 at 06:52:57PM +0000, Richard Henderson wrote:
> This patch set fixes a compilation error since the iOS merge,
> tidies up the port significantly, and finally adds support for
> complex and Go closures.

Hi Richard,

Possibly an irrelevant comment for this patch series, but while rewriting
the world, did you consider Jakub's comments in this thread on
gcc-patches regarding .note.GNU-stack notes?
  https://gcc.gnu.org/ml/gcc-patches/2014-09/msg00820.html

"I've noticed that on 4.8 branch libgo recently (in the last few months)
  started being linked with
    GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x10
  i.e. requiring executable stack on powerpc-linux (32-bit).

  The problem is that we link into libffi linux64.o and linux64_closure.o
  unconditionally, both for 32-bit and 64-bit compilations, just for 32-bit
  ones all the assembly is ifdefed out, so they have just empty sections.
  The .note.GNU-stack section isn't emitted in that case either, which means
  that the linker conservatively treats those as possibly needing executable
  stack.

  The following patch should fix that, ok for trunk/4.9/4.8?

  BTW, I wonder if e.g. libffi/src/arm/trampoline.S or
  libffi/src/aarch64/sysv.S shouldn't have those notes too (note, both of
  those were added after 2008 when most of the *.S files were marked that
  way)."

If it doesn't belong in this series, I'll propose a patch adding it once
your patches have gone in.

Thanks,
James

> Richard Henderson (16):
>   aarch64: Fix non-apple compilation
>   aarch64: Improve is_hfa
>   aarch64: Always distinguish LONGDOUBLE
>   aarch64: Simplify AARCH64_STACK_ALIGN
>   aarch64: Reduce the size of register_context
>   aarch64: Use correct return registers
>   aarch64: Treat void return as not passed in registers
>   aarch64: Tidy up abi manipulation
>   aarch64: Merge prep_args with ffi_call
>   aarch64: Move return value handling into ffi_call_SYSV
>   aarch64: Move return value handling into ffi_closure_SYSV
>   aarch64: Unify scalar fp and hfa handling
>   aarch64: Remove aarch64_flags
>   aarch64: Add support for complex types
>   aarch64: Move x8 out of call_context
>   aarch64: Add support for Go closures
>
>  src/aarch64/ffi.c              | 1477 ++++++++++++++++------------------------
>  src/aarch64/ffitarget.h        |   14 +-
>  src/aarch64/internal.h         |   67 ++
>  src/aarch64/sysv.S             |  589 +++++++++-------
>  testsuite/libffi.call/call.exp |   10 +-
>  5 files changed, 1008 insertions(+), 1149 deletions(-)
>  create mode 100644 src/aarch64/internal.h
>
> --
> 1.9.3
>
>
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 00/16] Go closures for aarch64

Richard Henderson
On 11/10/2014 11:12 AM, James Greenhalgh wrote:
> If it doesn't belong in this series, I'll propose a patch adding it once
> your patches have gone in.

I hadn't considered the missing .note.GNU-stack markers.
Fixing any that are still missing after my patch set would be most appreciated.


r~
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 08/16] aarch64: Tidy up abi manipulation

Andreas Schwab-2
In reply to this post by Richard Henderson
On Oct 28 2014, Richard Henderson <[hidden email]> wrote:

> + case FFI_TYPE_STRUCT:
> +  {
> +    void *dest;
> +    int elems;
> +
> +    h = is_hfa (ty);
> +    if (h)
> +      {
> +    do_hfa:
> + elems = h >> 8;
> +        if (state.nsrn + elems <= N_V_ARG_REG)
> +  {
> +    dest = &context->v[state.nsrn];
> +    state.nsrn += elems;
> +    extend_hfa_type (dest, a, h);
> +    break;
> +  }
> + state.nsrn = N_V_ARG_REG;
> + dest = allocate_to_stack (&state, stack, ty->alignment, s);
> +      }
> +    else if (s > 16)
> +      {
> + /* If the argument is a composite type that is larger than 16
> +   bytes, then the argument has been copied to memory, and
> +   the argument is replaced by a pointer to the copy.  */
> + a = &avalue[i];
> + t = FFI_TYPE_POINTER;
> + goto do_pointer;

I don't see where the argument has been copied to memory.  Doesn't that
need to call allocate_to_stack here?

Andreas.

--
Andreas Schwab, [hidden email]
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."