[PATCH v2 00/14] ARM improvements

classic Classic list List threaded Threaded
35 messages Options
12
Reply | Threaded
Open this post in threaded view
|

[PATCH v2 00/14] ARM improvements

Richard Henderson
I believe I've now addressed all of the comments to date.

Patches 4-8 have been approved, but they touch the same code
as patches 1-3, so I'm not rearranging them.

Patch 1 -- Spurious whitespace changes removed.
Patch 2 -- I never saw a review for this one?
Patch 3 -- Totally rewritten, in preparation for new patch for hard-tp.
Patch 6 -- Review mentioned deleting __APCS_32__, but this was already
           done in patch 7, so I didn't rearrange that.
Patch 9-10 -- New.
Patch 11-12 -- Micro-optimization suggestions from Mans Rullgard applied.
Patch 14 -- Adjusted to avoid r9.

I've yet to test the string routines on big-endian.  I'll delay
re-posting those until I have done so.


r~


Richard Henderson (14):
  arm: Introduce and use LDST_PCREL
  arm: Introduce and use NEGOFF series of macros
  arm: Introduce and use GET_TLS
  arm: Enable thumb2 mode in assembly files
  arm: Use push/pop mnemonics
  arm: Delete LOADREGS macro
  arm: Commonize BX conditionals
  arm: Unless arm4t, pop return address directly into pc
  arm: Tidy architecture selection
  arm: Implement hard-tp for GET_TLS
  arm: Add optimized ffs for armv6t2
  arm: Add optimized addmul_1
  arm: Add optimized submul_1
  arm: Add optimized add_n and sub_n

 ports/sysdeps/arm/__longjmp.S                      |   2 +-
 ports/sysdeps/arm/add_n.S                          |  83 ++++++++++++
 ports/sysdeps/arm/addmul_1.S                       |  66 +++++++++
 ports/sysdeps/arm/arm-mcount.S                     |  10 +-
 ports/sysdeps/arm/armv6t2/ffs.S                    |  35 +++++
 ports/sysdeps/arm/armv6t2/ffsll.S                  |  50 +++++++
 ports/sysdeps/arm/crti.S                           |   4 +-
 ports/sysdeps/arm/crtn.S                           |   8 +-
 ports/sysdeps/arm/dl-machine.h                     |   2 +-
 ports/sysdeps/arm/dl-tlsdesc.S                     |  42 +++---
 ports/sysdeps/arm/dl-trampoline.S                  |  13 +-
 ports/sysdeps/arm/memcpy.S                         |  58 ++++----
 ports/sysdeps/arm/memmove.S                        |  58 ++++----
 ports/sysdeps/arm/start.S                          |  10 +-
 ports/sysdeps/arm/sub_n.S                          |   2 +
 ports/sysdeps/arm/submul_1.S                       |  67 ++++++++++
 ports/sysdeps/arm/sysdep.h                         | 147 +++++++++++++++++----
 ports/sysdeps/unix/arm/sysdep.S                    |  32 ++---
 .../sysdeps/unix/sysv/linux/arm/____longjmp_chk.S  |   4 +-
 ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S  |   9 ++
 ports/sysdeps/unix/sysv/linux/arm/clone.S          |  13 +-
 ports/sysdeps/unix/sysv/linux/arm/mmap.S           |   8 +-
 ports/sysdeps/unix/sysv/linux/arm/mmap64.S         |   8 +-
 ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S  |  21 +--
 .../unix/sysv/linux/arm/nptl/sysdep-cancel.h       |  47 +++----
 .../unix/sysv/linux/arm/nptl/unwind-forcedunwind.c |   4 +-
 .../unix/sysv/linux/arm/nptl/unwind-resume.c       |   4 +-
 ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S     |  24 ++--
 ports/sysdeps/unix/sysv/linux/arm/syscall.S        |   4 +-
 ports/sysdeps/unix/sysv/linux/arm/sysdep.h         |  69 +++++++---
 ports/sysdeps/unix/sysv/linux/arm/vfork.S          |   2 +-
 31 files changed, 648 insertions(+), 258 deletions(-)
 create mode 100644 ports/sysdeps/arm/add_n.S
 create mode 100644 ports/sysdeps/arm/addmul_1.S
 create mode 100644 ports/sysdeps/arm/armv6t2/ffs.S
 create mode 100644 ports/sysdeps/arm/armv6t2/ffsll.S
 create mode 100644 ports/sysdeps/arm/sub_n.S
 create mode 100644 ports/sysdeps/arm/submul_1.S

--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 01/14] arm: Introduce and use LDST_PCREL

Richard Henderson
Macro-ise the few instances where we need to distinguish between
arm and thumb pc-relative memory operations.
---
        * sysdeps/arm/sysdep.h (LDST_PCREL): New macro.
        * sysdeps/unix/arm/sysdep.S (__syscall_error): Use LDST_PCREL.
        Fix up gottpoff load of errno for thumb2.
        * sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
        (SINGLE_THREAD_P): Use LDST_PCREL.
        (PSEUDO_PROLOGUE): Remove.
        (PSEUDO): Don't use it.
        * sysdeps/unix/sysv/linux/arm/sysdep.h (SYSCALL_ERROR_HANDLER):
        Use LDST_PCREL.
---
 ports/sysdeps/arm/sysdep.h                         | 17 +++++++++++++++++
 ports/sysdeps/unix/arm/sysdep.S                    | 22 ++++++++++++----------
 .../unix/sysv/linux/arm/nptl/sysdep-cancel.h       | 10 ++--------
 ports/sysdeps/unix/sysv/linux/arm/sysdep.h         | 10 ++++------
 4 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index 4af7429..29a78f0 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -117,6 +117,23 @@
    the caller.  */
  .eabi_attribute 24, 1
 
+/* Load or store to/from a pc-relative EXPR into/from R, using T.  */
+# ifdef __thumb2__
+#  define LDST_PCREL(OP, R, T, EXPR) \
+ ldr T, 98f; \
+ .subsection 2; \
+98: .word EXPR - 99f - PC_OFS; \
+ .previous; \
+99: add T, T, pc; \
+ OP R, [T]
+# else
+#  define LDST_PCREL(OP, R, T, EXPR) \
+ ldr T, 98f; \
+ .subsection 2; \
+98: .word EXPR - 99f - PC_OFS; \
+ .previous; \
+99: OP R, [pc, T]
+# endif
 #endif /* __ASSEMBLER__ */
 
 /* This number is the offset from the pc at the current location.  */
diff --git a/ports/sysdeps/unix/arm/sysdep.S b/ports/sysdeps/unix/arm/sysdep.S
index 40e4d80..d44ee48 100644
--- a/ports/sysdeps/unix/arm/sysdep.S
+++ b/ports/sysdeps/unix/arm/sysdep.S
@@ -45,20 +45,22 @@ __syscall_error:
  mov lr, pc
  sub pc, r0, #31
 
- ldr r2, 1f
-2: ldr r2, [pc, r2]
- str r1, [r0, r2]
- mvn r0, #0
- RETINSTR (, ip)
+ ldr r2, 1f
+#ifdef __thumb__
+2: add r2, r2, pc
+ ldr r2, [r2]
+#else
+2: ldr r2, [pc, r2]
+#endif
+ str r1, [r0, r2]
+ mvn r0, #0
+ DO_RET(ip)
 
 1: .word errno(gottpoff) + (. - 2b - PC_OFS)
 #elif RTLD_PRIVATE_ERRNO
- ldr r1, 1f
-0: str r0, [pc, r1]
- mvn r0, $0
+ LDST_PCREL(str, r0, r1, C_SYMBOL_NAME(rtld_errno))
+ mvn r0, #0
  DO_RET(r14)
-
-1: .word C_SYMBOL_NAME(rtld_errno) - 0b - PC_OFS
 #else
 #error "Unsupported non-TLS case"
 #endif
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
index df85d51..8889369 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
@@ -31,7 +31,6 @@
 # undef PSEUDO
 # define PSEUDO(name, syscall_name, args) \
  .text; \
- PSEUDO_PROLOGUE; \
   ENTRY (__##syscall_name##_nocancel); \
  CFI_SECTIONS; \
  DO_CALL (syscall_name, args); \
@@ -203,12 +202,8 @@ extern int __local_multiple_threads attribute_hidden;
 #   define SINGLE_THREAD_P __builtin_expect (__local_multiple_threads == 0, 1)
 #  else
 #   define SINGLE_THREAD_P \
- ldr ip, 1b; \
-  2: \
- ldr ip, [pc, ip]; \
- teq ip, #0;
-#   define PSEUDO_PROLOGUE \
-  1: .word __local_multiple_threads - 2f - PC_OFS;
+ LDST_PCREL(ldr, ip, ip, __local_multiple_threads); \
+ teq ip, #0
 #  endif
 # else
 /*  There is no __local_multiple_threads for librt, so use the TCB.  */
@@ -217,7 +212,6 @@ extern int __local_multiple_threads attribute_hidden;
   __builtin_expect (THREAD_GETMEM (THREAD_SELF, \
    header.multiple_threads) == 0, 1)
 #  else
-#   define PSEUDO_PROLOGUE
 #   define SINGLE_THREAD_P \
  stmfd sp!, {r0, lr}; \
  cfi_adjust_cfa_offset (8); \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
index f40cb95..89208a9 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
@@ -110,12 +110,10 @@
 # if RTLD_PRIVATE_ERRNO
 #  define SYSCALL_ERROR_HANDLER \
 __local_syscall_error: \
-       ldr     r1, 1f; \
-       rsb     r0, r0, #0; \
-0:     str     r0, [pc, r1]; \
-       mvn     r0, #0; \
-       DO_RET(lr); \
-1:     .word C_SYMBOL_NAME(rtld_errno) - 0b - PC_OFS;
+ rsb r0, r0, #0; \
+ LDST_PCREL(str, r0, r1, C_SYMBOL_NAME(rtld_errno)); \
+ mvn r0, #0; \
+ DO_RET(lr)
 # else
 #  if defined(__ARM_ARCH_4T__) && defined(__THUMB_INTERWORK__)
 #   define POP_PC \
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 02/14] arm: Introduce and use NEGOFF series of macros

Richard Henderson
In reply to this post by Richard Henderson
There are several places in which we access negative offsets from
the thread-pointer, but thumb2 only supports positive offsets in
memory references.

Avoid duplicating the rather large macros in which these references
are embedded by abstracting out the operation.
---
        * sysdeps/arm/sysdep.h (NEGOFF_ADJ_BASE): New macro.
        (NEGOFF_ADJ_BASE2, NEGOFF_OFF1, NEGOFF_OFF2): New macros.
        * sysdeps/unix/sysv/linux/arm/clone.S (__clone): Use them.
        * sysdeps/unix/sysv/linux/arm/nptl/vfork.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h (SINGLE_THREAD_P):
        Likewise.
---
 ports/sysdeps/arm/sysdep.h                             | 16 ++++++++++++++++
 ports/sysdeps/unix/sysv/linux/arm/clone.S              |  5 +++--
 ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S      | 11 ++++++-----
 ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h |  3 ++-
 ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S         | 14 ++++++++------
 5 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index 29a78f0..9230131 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -134,6 +134,22 @@
  .previous; \
 99: OP R, [pc, T]
 # endif
+
+/* Cope with negative memory offsets, which thumb can't encode.
+   Use NEGOFF_ADJ_BASE to (conditionally) alter the base register,
+   and then NEGOFF_OFF1 to use 0 for thumb and the offset for arm,
+   or NEGOFF_OFF2 to use A-B for thumb and A for arm.  */
+# ifdef __thumb2__
+#  define NEGOFF_ADJ_BASE(R, OFF) add R, R, $OFF
+#  define NEGOFF_ADJ_BASE2(D, S, OFF) add D, S, $OFF
+#  define NEGOFF_OFF1(R, OFF) [R]
+#  define NEGOFF_OFF2(R, OFFA, OFFB) [R, $((OFFA) - (OFFB))]
+# else
+#  define NEGOFF_ADJ_BASE(R, OFF)
+#  define NEGOFF_ADJ_BASE2(D, S, OFF) mov D, S
+#  define NEGOFF_OFF1(R, OFF) [R, $OFF]
+#  define NEGOFF_OFF2(R, OFFA, OFFB) [R, $OFFA]
+# endif
 #endif /* __ASSEMBLER__ */
 
 /* This number is the offset from the pc at the current location.  */
diff --git a/ports/sysdeps/unix/sysv/linux/arm/clone.S b/ports/sysdeps/unix/sysv/linux/arm/clone.S
index 732a3ff..a5f9b4d 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/clone.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/clone.S
@@ -83,8 +83,9 @@ PSEUDO_END (__clone)
  ite ne
  movne r0, #-1
  swieq 0x0
- str r0, [r1, #PID_OFFSET]
- str r0, [r1, #TID_OFFSET]
+ NEGOFF_ADJ_BASE(r1, TID_OFFSET)
+ str r0, NEGOFF_OFF1(r1, TID_OFFSET)
+ str r0, NEGOFF_OFF2(r1, PID_OFFSET, TID_OFFSET)
 3:
 #endif
  @ pick the function arg and call address off the stack and execute
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S b/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S
index a38d564..ff88510 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S
@@ -28,14 +28,15 @@
  ldr lr, [sp], #4; /* Restore LR.  */ \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (lr); \
- mov r2, r0; /* Save the TLS addr in r2.  */ \
- ldr r3, [r2, #PID_OFFSET]; /* Load the saved PID.  */ \
- rsb r0, r3, #0; /* Negate it.  */ \
- str r0, [r2, #PID_OFFSET] /* Store the temporary PID.  */
+ NEGOFF_ADJ_BASE2(r2, r0, PID_OFFSET); /* Save the TLS addr in r2. */ \
+ ldr r3, NEGOFF_OFF1(r2, PID_OFFSET); /* Load the saved PID.  */  \
+ rsb r0, r3, #0; /* Negate it.  */     \
+ str r0, NEGOFF_OFF1(r2, PID_OFFSET); /* Store the temp PID.  */
 
 /* Restore the old PID value in the parent.  */
 #define RESTORE_PID \
  cmp r0, #0; /* If we are the parent... */ \
- strne r3, [r2, #PID_OFFSET] /* ... restore the saved PID.  */
+ it ne; \
+ strne r3, NEGOFF_OFF1(r2, PID_OFFSET); /* restore the saved PID.  */
 
 #include "../vfork.S"
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
index 8889369..47d4c70 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
@@ -217,7 +217,8 @@ extern int __local_multiple_threads attribute_hidden;
  cfi_adjust_cfa_offset (8); \
  cfi_rel_offset (lr, 4); \
  bl __aeabi_read_tp; \
- ldr ip, [r0, #MULTIPLE_THREADS_OFFSET]; \
+ NEGOFF_ADJ_BASE(r0, MULTIPLE_THREADS_OFFSET); \
+ ldr ip, NEGOFF_OFF1(r0, MULTIPLE_THREADS_OFFSET); \
  ldmfd sp!, {r0, lr}; \
  cfi_adjust_cfa_offset (-8); \
  cfi_restore (lr); \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S b/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S
index 3fce2d1..c4be1e2 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S
@@ -28,15 +28,17 @@
  ldr lr, [sp], #4; /* Restore LR.  */ \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (lr); \
- mov r2, r0; /* Save the TLS addr in r2.  */ \
- ldr r3, [r2, #PID_OFFSET]; /* Load the saved PID.  */ \
- rsbs r0, r3, #0; /* Negate it.  */ \
- moveq r0, #0x80000000; /* Use 0x80000000 if it was 0.  */ \
- str r0, [r2, #PID_OFFSET] /* Store the temporary PID.  */
+ NEGOFF_ADJ_BASE2(r2, r0, PID_OFFSET); /* Save the TLS addr in r2.  */ \
+ ldr r3, NEGOFF_OFF1(r2, PID_OFFSET); /* Load the saved PID.  */   \
+ rsbs r0, r3, #0; /* Negate it.  */      \
+ it eq;      \
+ moveq r0, #0x80000000; /* Use 0x80000000 if it was 0.  */    \
+ str r0, NEGOFF_OFF1(r2, PID_OFFSET); /* Store the temp PID.  */
 
 /* Restore the old PID value in the parent.  */
 #define RESTORE_PID \
  cmp r0, #0; /* If we are the parent... */ \
- strne r3, [r2, #PID_OFFSET] /* ... restore the saved PID.  */
+ it ne; \
+ strne r3, NEGOFF_OFF1(r2, PID_OFFSET); /* restore the saved PID.  */
 
 #include "../vfork.S"
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 03/14] arm: Introduce and use GET_TLS

Richard Henderson
In reply to this post by Richard Henderson
Factor out the sequence needed to call kuser_get_tls, as we can't
play subtract-into-pc games in thumb mode.  Prepare for hard-tp by
pulling the save of LR into the macro.
---
        * sysdeps/arm/sysdep.h (GET_TLS): New macro.
        * sysdeps/arm/dl-tlsdesc.S (_dl_tlsdesc_undefweak): Use it.
        (_dl_tlsdesc_dynamic): Likewise.
        * sysdeps/unix/arm/sysdep.S (__syscall_error): Likewise.
        * sysdeps/unix/sysv/linux/arm/sysdep.h (GET_TLS): New macro.
        * sysdeps/unix/sysv/linux/arm/clone.S (__clone): Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S (SAVE_PID): Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/vfork.S (SAVE_PID): Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h (SINGLE_THREAD_P):
        Likewise.
        * sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S (__aeabi_read_tp):
        Add thumb2 alternative.
---
 ports/sysdeps/arm/dl-tlsdesc.S                     | 13 ++--------
 ports/sysdeps/arm/sysdep.h                         | 19 ++++++++++++++
 ports/sysdeps/unix/arm/sysdep.S                    | 12 +++------
 ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S  |  6 +++++
 ports/sysdeps/unix/sysv/linux/arm/clone.S          |  4 +--
 ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S  | 10 +-------
 .../unix/sysv/linux/arm/nptl/sysdep-cancel.h       |  2 +-
 ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S     | 10 +-------
 ports/sysdeps/unix/sysv/linux/arm/sysdep.h         | 30 ++++++++++++++++++++++
 9 files changed, 64 insertions(+), 42 deletions(-)

diff --git a/ports/sysdeps/arm/dl-tlsdesc.S b/ports/sysdeps/arm/dl-tlsdesc.S
index 7b4c8df..1c3bccf 100644
--- a/ports/sysdeps/arm/dl-tlsdesc.S
+++ b/ports/sysdeps/arm/dl-tlsdesc.S
@@ -50,18 +50,9 @@ _dl_tlsdesc_return:
  .fnstart
  .align 2
 _dl_tlsdesc_undefweak:
- @ Are we allowed a misaligned stack pointer calling read_tp?
- .save {lr}
- stmdb sp!, {lr}
- cfi_adjust_cfa_offset (4)
- cfi_rel_offset (lr,0)
- bl __aeabi_read_tp
+ GET_TLS(r1)
  rsb r0, r0, #0
- ldmia sp!, {lr}
- cfi_adjust_cfa_offset (-4)
- cfi_restore (lr)
  BX (lr)
-
  cfi_endproc
  .fnend
  .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
@@ -106,7 +97,7 @@ _dl_tlsdesc_dynamic:
  cfi_rel_offset (r4,8)
  cfi_rel_offset (lr,12)
  ldr r1, [r0] /* td */
- bl __aeabi_read_tp
+ GET_TLS(lr)
  mov r4, r0 /* r4 = tp */
  ldr r0, [r0]
  ldr r2, [r1, #8] /* gen_count */
diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index 9230131..c525d5b 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -150,6 +150,25 @@
 #  define NEGOFF_OFF1(R, OFF) [R, $OFF]
 #  define NEGOFF_OFF2(R, OFFA, OFFB) [R, $OFFA]
 # endif
+
+/* Helper to get the TLS base pointer.  The interface is that TMP is a
+   register that may be used to hold the LR, if necessary.  TMP may be
+   LR itself to indicate that LR need not be saved.  The base pointer
+   is returned in R0.  Only R0 and TMP are modified.
+
+   At this generic level we have no tricks to pull.  Call the ABI routine.  */
+# define GET_TLS(TMP) \
+ push { r1, r2, r3, lr }; \
+ cfi_remember_state; \
+ cfi_adjust_cfa_offset (16); \
+ cfi_rel_offset (r1, 0); \
+ cfi_rel_offset (r2, 4); \
+ cfi_rel_offset (r3, 8); \
+ cfi_rel_offset (lr, 12); \
+ bl __aeabi_read_tp; \
+ pop { r1, r2, r3, lr }; \
+ cfi_restore_state
+
 #endif /* __ASSEMBLER__ */
 
 /* This number is the offset from the pc at the current location.  */
diff --git a/ports/sysdeps/unix/arm/sysdep.S b/ports/sysdeps/unix/arm/sysdep.S
index d44ee48..b07cba9 100644
--- a/ports/sysdeps/unix/arm/sysdep.S
+++ b/ports/sysdeps/unix/arm/sysdep.S
@@ -37,14 +37,8 @@ __syscall_error:
 #endif
 
 #ifndef IS_IN_rtld
- mov ip, lr
- cfi_register (lr, ip)
- mov r1, r0
-
- mov r0, #0xffff0fff
- mov lr, pc
- sub pc, r0, #31
-
+ mov r1, r0
+ GET_TLS(r2)
  ldr r2, 1f
 #ifdef __thumb__
 2: add r2, r2, pc
@@ -54,7 +48,7 @@ __syscall_error:
 #endif
  str r1, [r0, r2]
  mvn r0, #0
- DO_RET(ip)
+ DO_RET(lr)
 
 1: .word errno(gottpoff) + (. - 2b - PC_OFS)
 #elif RTLD_PRIVATE_ERRNO
diff --git a/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S b/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S
index c4ddbc6..ecdc322 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S
@@ -41,6 +41,12 @@
 
  .hidden __aeabi_read_tp
 ENTRY (__aeabi_read_tp)
+#ifdef __thumb2__
+ movw r0, #0x0fe0
+ movt r0, #0xffff
+ bx r0
+#else
  mov r0, #0xffff0fff
  sub pc, r0, #31
+#endif
 END (__aeabi_read_tp)
diff --git a/ports/sysdeps/unix/sysv/linux/arm/clone.S b/ports/sysdeps/unix/sysv/linux/arm/clone.S
index a5f9b4d..1bc5eab 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/clone.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/clone.S
@@ -74,9 +74,7 @@ PSEUDO_END (__clone)
 #ifdef RESET_PID
  tst ip, #CLONE_THREAD
  bne 3f
- mov r0, #0xffff0fff
- mov lr, pc
- sub pc, r0, #31
+ GET_TLS(lr)
  mov r1, r0
  tst ip, #CLONE_VM
  ldr r7, =SYS_ify(getpid)
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S b/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S
index ff88510..c731cd7 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/pt-vfork.S
@@ -19,15 +19,7 @@
 
 /* Save the PID value.  */
 #define SAVE_PID \
- str lr, [sp, #-4]!; /* Save LR.  */ \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (lr, 0); \
- mov r0, #0xffff0fff; /* Point to the high page.  */ \
- mov lr, pc; /* Save our return address.  */ \
- sub pc, r0, #31; /* Jump to the TLS entry.  */ \
- ldr lr, [sp], #4; /* Restore LR.  */ \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (lr); \
+ GET_TLS(r2); \
  NEGOFF_ADJ_BASE2(r2, r0, PID_OFFSET); /* Save the TLS addr in r2. */ \
  ldr r3, NEGOFF_OFF1(r2, PID_OFFSET); /* Load the saved PID.  */  \
  rsb r0, r3, #0; /* Negate it.  */     \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
index 47d4c70..d5e666b 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
@@ -216,7 +216,7 @@ extern int __local_multiple_threads attribute_hidden;
  stmfd sp!, {r0, lr}; \
  cfi_adjust_cfa_offset (8); \
  cfi_rel_offset (lr, 4); \
- bl __aeabi_read_tp; \
+ GET_TLS(lr); \
  NEGOFF_ADJ_BASE(r0, MULTIPLE_THREADS_OFFSET); \
  ldr ip, NEGOFF_OFF1(r0, MULTIPLE_THREADS_OFFSET); \
  ldmfd sp!, {r0, lr}; \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S b/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S
index c4be1e2..accecf2 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/vfork.S
@@ -19,15 +19,7 @@
 
 /* Save the PID value.  */
 #define SAVE_PID \
- str lr, [sp, #-4]!; /* Save LR.  */ \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (lr, 0); \
- mov r0, #0xffff0fff; /* Point to the high page.  */ \
- mov lr, pc; /* Save our return address.  */ \
- sub pc, r0, #31; /* Jump to the TLS entry.  */ \
- ldr lr, [sp], #4; /* Restore LR.  */ \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (lr); \
+ GET_TLS(r2); \
  NEGOFF_ADJ_BASE2(r2, r0, PID_OFFSET); /* Save the TLS addr in r2.  */ \
  ldr r3, NEGOFF_OFF1(r2, PID_OFFSET); /* Load the saved PID.  */   \
  rsbs r0, r3, #0; /* Negate it.  */      \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
index 89208a9..01d8123 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
@@ -45,6 +45,36 @@
 
 #ifdef __ASSEMBLER__
 
+/* Internal macro calling the linux kernel kuser_get_tls helper.
+   Note that in thumb mode, a constant pool break is often out of range, so
+   we always expand the constant inline.  */
+#ifdef __thumb2__
+# define GET_TLS_BODY \
+ movw r0, #0x0fe0; \
+ movt r0, #0xffff; \
+ blx r0
+#else
+# define GET_TLS_BODY \
+ mov r0, #0xffff0fff; /* Point to the high page.  */ \
+ mov lr, pc; /* Save our return address.  */ \
+ sub pc, r0, #31 /* Jump to the TLS entry.  */
+#endif
+
+/* Helper to get the TLS base pointer.  Save LR in TMP, return in R0,
+   and no other registers clobbered.  TMP may be LR itself to indicate
+   that no save is necessary.  */
+#undef GET_TLS
+#define GET_TLS(TMP) \
+  .ifnc TMP, lr; \
+ mov TMP, lr; \
+ cfi_register (lr, TMP); \
+ GET_TLS_BODY; \
+ mov lr, TMP; \
+ cfi_restore (lr); \
+  .else; \
+ GET_TLS_BODY; \
+  .endif
+
 /* Linux uses a negative return value to indicate syscall errors,
    unlike most Unices, which use the condition codes' carry flag.
 
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 04/14] arm: Enable thumb2 mode in assembly files

Richard Henderson
In reply to this post by Richard Henderson
The preceding patches have allowed for the few incompatibilities
between arm and thumb2 mode, or have marked the file as not wanting
to use thumb2 mode.
---
        * sysdeps/arm/sysdep.h [__ASSEMBLER__]: Enable thumb2 if __thumb2__.
        (PC_OFS): Respect __thumb__ if __ASSEMBLER__.
---
 ports/sysdeps/arm/sysdep.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index c525d5b..d855ceb 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -117,6 +117,16 @@
    the caller.  */
  .eabi_attribute 24, 1
 
+/* The thumb2 encoding is reasonably complete.  Unless suppressed, use it.  */
+ .syntax unified
+# if defined(__thumb2__) && !defined(NO_THUMB)
+ .thumb
+#else
+#  undef __thumb__
+#  undef __thumb2__
+ .arm
+# endif
+
 /* Load or store to/from a pc-relative EXPR into/from R, using T.  */
 # ifdef __thumb2__
 #  define LDST_PCREL(OP, R, T, EXPR) \
@@ -172,8 +182,7 @@
 #endif /* __ASSEMBLER__ */
 
 /* This number is the offset from the pc at the current location.  */
-/* ??? At the moment we're not turning on thumb mode in assembly.  */
-#if defined(__thumb__) && !defined(__ASSEMBLER__)
+#ifdef __thumb__
 # define PC_OFS  4
 #else
 # define PC_OFS  8
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 05/14] arm: Use push/pop mnemonics

Richard Henderson
In reply to this post by Richard Henderson
For arm this makes no difference--the result is bit-for-bit identical;
for thumb this results in smaller encodings.  Perhaps it ought not to, and
this is in fact an assembler bug, but I also think push/pop is clearer.
---
        * sysdeps/arm/arm-mcount.S (_mcount): Use push/pop mnemonics.
        * sysdeps/arm/crti.S, sysdeps/arm/crtn.S: Likewise.
        * sysdeps/arm/dl-tlsdesc.S: Likewise.
        * sysdeps/arm/dl-trampoline.S: Likewise.
        * sysdeps/arm/start.S: Likewise.
        * sysdeps/arm/memcpy.S (PULL): Rename macro from pull.
        (PUSH): Rename macro from push.
        (memcpy): Use push/pop mnemonics.
        * sysdeps/arm/memmove.S: Similarly.
        * sysdeps/arm/sysdep.h (CALL_MCOUNT): Use push/pop mnemonics.
        * sysdeps/unix/sysv/linux/arm/____longjmp_chk.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/clone.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/mmap.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/mmap64.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h: Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/unwind-forcedunwind.c: Likewise.
        * sysdeps/unix/sysv/linux/arm/nptl/unwind-resume.c: Likewise.
        * sysdeps/unix/sysv/linux/arm/syscall.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/sysdep.h: Likewise.
        * sysdeps/unix/sysv/linux/arm/vfork.S: Likewise.
---
 ports/sysdeps/arm/arm-mcount.S                     |  6 +--
 ports/sysdeps/arm/crti.S                           |  4 +-
 ports/sysdeps/arm/crtn.S                           |  8 +--
 ports/sysdeps/arm/dl-tlsdesc.S                     | 16 +++---
 ports/sysdeps/arm/dl-trampoline.S                  |  4 +-
 ports/sysdeps/arm/memcpy.S                         | 58 +++++++++++-----------
 ports/sysdeps/arm/memmove.S                        | 58 +++++++++++-----------
 ports/sysdeps/arm/start.S                          | 10 ++--
 ports/sysdeps/arm/sysdep.h                         |  6 +--
 .../sysdeps/unix/sysv/linux/arm/____longjmp_chk.S  |  4 +-
 ports/sysdeps/unix/sysv/linux/arm/clone.S          |  4 +-
 ports/sysdeps/unix/sysv/linux/arm/mmap.S           |  8 +--
 ports/sysdeps/unix/sysv/linux/arm/mmap64.S         |  8 +--
 .../unix/sysv/linux/arm/nptl/sysdep-cancel.h       | 32 ++++++------
 .../unix/sysv/linux/arm/nptl/unwind-forcedunwind.c |  4 +-
 .../unix/sysv/linux/arm/nptl/unwind-resume.c       |  4 +-
 ports/sysdeps/unix/sysv/linux/arm/syscall.S        |  4 +-
 ports/sysdeps/unix/sysv/linux/arm/sysdep.h         | 27 +++++-----
 ports/sysdeps/unix/sysv/linux/arm/vfork.S          |  2 +-
 19 files changed, 133 insertions(+), 134 deletions(-)

diff --git a/ports/sysdeps/arm/arm-mcount.S b/ports/sysdeps/arm/arm-mcount.S
index 679d042..b6e5ec7 100644
--- a/ports/sysdeps/arm/arm-mcount.S
+++ b/ports/sysdeps/arm/arm-mcount.S
@@ -69,7 +69,7 @@ END(__gnu_mcount_nc)
    code be compiled with APCS frame pointers.  */
 
 ENTRY(_mcount)
- stmdb sp!, {r0, r1, r2, r3, fp, lr}
+ push {r0, r1, r2, r3, fp, lr}
  cfi_adjust_cfa_offset (24)
  cfi_rel_offset (r0, 0)
  cfi_rel_offset (r1, 4)
@@ -83,9 +83,9 @@ ENTRY(_mcount)
  movsne r1, lr
  blne __mcount_internal
 #ifdef __thumb2__
- ldmia sp!, {r0, r1, r2, r3, fp, pc}
+ pop {r0, r1, r2, r3, fp, pc}
 #else
- ldmia sp!, {r0, r1, r2, r3, fp, lr}
+ pop {r0, r1, r2, r3, fp, lr}
  cfi_adjust_cfa_offset (-24)
  cfi_restore (r0)
  cfi_restore (r1)
diff --git a/ports/sysdeps/arm/crti.S b/ports/sysdeps/arm/crti.S
index 1d55ae2..be20a11 100644
--- a/ports/sysdeps/arm/crti.S
+++ b/ports/sysdeps/arm/crti.S
@@ -80,7 +80,7 @@ call_weak_fn:
  .globl _init
  .type _init, %function
 _init:
- stmfd sp!, {r3, lr}
+ push {r3, lr}
 #if PREINIT_FUNCTION_WEAK
  bl call_weak_fn
 #else
@@ -92,4 +92,4 @@ _init:
  .globl _fini
  .type _fini, %function
 _fini:
- stmfd sp!, {r3, lr}
+ push {r3, lr}
diff --git a/ports/sysdeps/arm/crtn.S b/ports/sysdeps/arm/crtn.S
index a01eb01..ae7546c 100644
--- a/ports/sysdeps/arm/crtn.S
+++ b/ports/sysdeps/arm/crtn.S
@@ -42,16 +42,16 @@
 
  .section .init,"ax",%progbits
 #ifdef __ARM_ARCH_4T__
- ldmfd sp!, {r3, lr}
+ pop {r3, lr}
  bx lr
 #else
- ldmfd sp!, {r3, pc}
+ pop {r3, pc}
 #endif
 
  .section .fini,"ax",%progbits
 #ifdef __ARM_ARCH_4T__
- ldmfd sp!, {r3, lr}
+ pop {r3, lr}
  bx lr
 #else
- ldmfd sp!, {r3, pc}
+ pop {r3, pc}
 #endif
diff --git a/ports/sysdeps/arm/dl-tlsdesc.S b/ports/sysdeps/arm/dl-tlsdesc.S
index 1c3bccf..3a956de 100644
--- a/ports/sysdeps/arm/dl-tlsdesc.S
+++ b/ports/sysdeps/arm/dl-tlsdesc.S
@@ -90,7 +90,7 @@ _dl_tlsdesc_dynamic:
  /* Our calling convention is to clobber r0, r1 and the processor
    flags.  All others that are modified must be saved */
  .save {r2,r3,r4,lr}
- stmdb   sp!, {r2,r3,r4,lr}
+ push {r2,r3,r4,lr}
  cfi_adjust_cfa_offset (16)
  cfi_rel_offset (r2,0)
  cfi_rel_offset (r3,4)
@@ -115,7 +115,7 @@ _dl_tlsdesc_dynamic:
 1: mov r0, r1
  bl __tls_get_addr
  rsb r0, r4, r0
-2: ldmia sp!, {r2,r3,r4, lr}
+2: pop {r2,r3,r4, lr}
  cfi_adjust_cfa_offset (-16)
  cfi_restore (lr)
  cfi_restore (r4)
@@ -146,7 +146,7 @@ _dl_tlsdesc_lazy_resolver:
  cfi_adjust_cfa_offset (4)
  cfi_rel_offset (r2, 0)
  .save {r0,r1,r3,ip,lr}
- stmdb sp!, {r0, r1, r3, ip, lr}
+ push {r0, r1, r3, ip, lr}
  cfi_adjust_cfa_offset (20)
  cfi_rel_offset (r0, 0)
  cfi_rel_offset (r1, 4)
@@ -154,14 +154,14 @@ _dl_tlsdesc_lazy_resolver:
  cfi_rel_offset (ip, 12)
  cfi_rel_offset (lr, 16)
  bl _dl_tlsdesc_lazy_resolver_fixup
- ldmia sp!, {r0, r1, r3, ip, lr}
+ pop {r0, r1, r3, ip, lr}
  cfi_adjust_cfa_offset (-20)
  cfi_restore (lr)
  cfi_restore (ip)
  cfi_restore (r3)
  cfi_restore (r1)
  cfi_restore (r0)
- ldmia sp!, {r2}
+ pop {r2}
  cfi_adjust_cfa_offset (-4)
  cfi_restore (r2)
  ldr r1, [r0, #4]
@@ -184,7 +184,7 @@ _dl_tlsdesc_resolve_hold:
  cfi_adjust_cfa_offset (4)
  cfi_rel_offset (r2, 0)
  .save {r0,r1,r3,ip,lr}
- stmdb   sp!, {r0, r1, r3, ip, lr}
+ push {r0, r1, r3, ip, lr}
  cfi_adjust_cfa_offset (20)
  cfi_rel_offset (r0, 0)
  cfi_rel_offset (r1, 4)
@@ -193,14 +193,14 @@ _dl_tlsdesc_resolve_hold:
  cfi_rel_offset (lr, 16)
  adr r2, _dl_tlsdesc_resolve_hold
  bl _dl_tlsdesc_resolve_hold_fixup
- ldmia   sp!, {r0, r1, r3, ip, lr}
+ pop {r0, r1, r3, ip, lr}
  cfi_adjust_cfa_offset (-20)
  cfi_restore (lr)
  cfi_restore (ip)
  cfi_restore (r3)
  cfi_restore (r1)
  cfi_restore (r0)
- ldmia   sp!, {r2}
+ pop {r2}
  cfi_adjust_cfa_offset (-4)
  cfi_restore (r2)
  ldr     r1, [r0, #4]
diff --git a/ports/sysdeps/arm/dl-trampoline.S b/ports/sysdeps/arm/dl-trampoline.S
index 561d8ae..f2d1679 100644
--- a/ports/sysdeps/arm/dl-trampoline.S
+++ b/ports/sysdeps/arm/dl-trampoline.S
@@ -43,7 +43,7 @@ _dl_runtime_resolve:
  @ lr points to &GOT[2]
 
  @ Save arguments.  We save r4 to realign the stack.
- stmdb sp!,{r0-r4}
+ push {r0-r4}
  cfi_adjust_cfa_offset (20)
  cfi_rel_offset (r0, 0)
  cfi_rel_offset (r1, 4)
@@ -67,7 +67,7 @@ _dl_runtime_resolve:
 
  @ get arguments and return address back.  We restore r4
  @ only to realign the stack.
- ldmia sp!, {r0-r4,lr}
+ pop {r0-r4,lr}
  cfi_adjust_cfa_offset (-24)
 
  @ jump to the newly found address
diff --git a/ports/sysdeps/arm/memcpy.S b/ports/sysdeps/arm/memcpy.S
index 98b9b47..98981ef 100644
--- a/ports/sysdeps/arm/memcpy.S
+++ b/ports/sysdeps/arm/memcpy.S
@@ -45,11 +45,11 @@
  * Endian independent macros for shifting bytes within registers.
  */
 #ifndef __ARMEB__
-#define pull            lsr
-#define push            lsl
+#define PULL            lsr
+#define PUSH            lsl
 #else
-#define pull            lsl
-#define push            lsr
+#define PULL            lsl
+#define PUSH            lsr
 #endif
 
  .text
@@ -58,7 +58,7 @@
 
 ENTRY(memcpy)
 
- stmfd sp!, {r0, r4, lr}
+ push {r0, r4, lr}
  cfi_adjust_cfa_offset (12)
  cfi_rel_offset (r4, 4)
  cfi_rel_offset (lr, 8)
@@ -74,7 +74,7 @@ ENTRY(memcpy)
  bne 10f
 
 1: subs r2, r2, #(28)
- stmfd sp!, {r5 - r8}
+ push {r5 - r8}
  cfi_adjust_cfa_offset (16)
  cfi_rel_offset (r5, 0)
  cfi_rel_offset (r6, 4)
@@ -131,7 +131,7 @@ ENTRY(memcpy)
 
  CALGN( bcs 2b )
 
-7: ldmfd sp!, {r5 - r8}
+7: pop {r5 - r8}
  cfi_adjust_cfa_offset (-16)
  cfi_restore (r5)
  cfi_restore (r6)
@@ -147,13 +147,13 @@ ENTRY(memcpy)
  strcsb ip, [r0]
 
 #if defined (__ARM_ARCH_4T__) && defined(__THUMB_INTERWORK__)
- ldmfd sp!, {r0, r4, lr}
+ pop {r0, r4, lr}
  cfi_adjust_cfa_offset (-12)
  cfi_restore (r4)
  cfi_restore (lr)
  bx      lr
 #else
- ldmfd sp!, {r0, r4, pc}
+ pop {r0, r4, pc}
 #endif
 
  cfi_restore_state
@@ -189,7 +189,7 @@ ENTRY(memcpy)
  CALGN( subcc r2, r2, ip )
  CALGN( bcc 15f )
 
-11: stmfd sp!, {r5 - r9}
+11: push {r5 - r9}
  cfi_adjust_cfa_offset (20)
  cfi_rel_offset (r5, 0)
  cfi_rel_offset (r6, 4)
@@ -206,30 +206,30 @@ ENTRY(memcpy)
 
 12: PLD( pld [r1, #124] )
 13: ldmia r1!, {r4, r5, r6, r7}
- mov r3, lr, pull #\pull
+ mov r3, lr, PULL #\pull
  subs r2, r2, #32
  ldmia r1!, {r8, r9, ip, lr}
- orr r3, r3, r4, push #\push
- mov r4, r4, pull #\pull
- orr r4, r4, r5, push #\push
- mov r5, r5, pull #\pull
- orr r5, r5, r6, push #\push
- mov r6, r6, pull #\pull
- orr r6, r6, r7, push #\push
- mov r7, r7, pull #\pull
- orr r7, r7, r8, push #\push
- mov r8, r8, pull #\pull
- orr r8, r8, r9, push #\push
- mov r9, r9, pull #\pull
- orr r9, r9, ip, push #\push
- mov ip, ip, pull #\pull
- orr ip, ip, lr, push #\push
+ orr r3, r3, r4, PUSH #\push
+ mov r4, r4, PULL #\pull
+ orr r4, r4, r5, PUSH #\push
+ mov r5, r5, PULL #\pull
+ orr r5, r5, r6, PUSH #\push
+ mov r6, r6, PULL #\pull
+ orr r6, r6, r7, PUSH #\push
+ mov r7, r7, PULL #\pull
+ orr r7, r7, r8, PUSH #\push
+ mov r8, r8, PULL #\pull
+ orr r8, r8, r9, PUSH #\push
+ mov r9, r9, PULL #\pull
+ orr r9, r9, ip, PUSH #\push
+ mov ip, ip, PULL #\pull
+ orr ip, ip, lr, PUSH #\push
  stmia r0!, {r3, r4, r5, r6, r7, r8, r9, ip}
  bge 12b
  PLD( cmn r2, #96 )
  PLD( bge 13b )
 
- ldmfd sp!, {r5 - r9}
+ pop {r5 - r9}
  cfi_adjust_cfa_offset (-20)
  cfi_restore (r5)
  cfi_restore (r6)
@@ -240,10 +240,10 @@ ENTRY(memcpy)
 14: ands ip, r2, #28
  beq 16f
 
-15: mov r3, lr, pull #\pull
+15: mov r3, lr, PULL #\pull
  ldr lr, [r1], #4
  subs ip, ip, #4
- orr r3, r3, lr, push #\push
+ orr r3, r3, lr, PUSH #\push
  str r3, [r0], #4
  bgt 15b
  CALGN( cmp r2, #0 )
diff --git a/ports/sysdeps/arm/memmove.S b/ports/sysdeps/arm/memmove.S
index 059ca7a..d9fa0e3 100644
--- a/ports/sysdeps/arm/memmove.S
+++ b/ports/sysdeps/arm/memmove.S
@@ -45,11 +45,11 @@
  * Endian independent macros for shifting bytes within registers.
  */
 #ifndef __ARMEB__
-#define pull            lsr
-#define push            lsl
+#define PULL            lsr
+#define PUSH            lsl
 #else
-#define pull            lsl
-#define push            lsr
+#define PULL            lsl
+#define PUSH            lsr
 #endif
 
  .text
@@ -73,7 +73,7 @@ ENTRY(memmove)
  bls HIDDEN_JUMPTARGET(memcpy)
 #endif
 
- stmfd sp!, {r0, r4, lr}
+ push {r0, r4, lr}
  cfi_adjust_cfa_offset (12)
  cfi_rel_offset (r4, 4)
  cfi_rel_offset (lr, 8)
@@ -91,7 +91,7 @@ ENTRY(memmove)
  bne 10f
 
 1: subs r2, r2, #(28)
- stmfd sp!, {r5 - r8}
+ push {r5 - r8}
  cfi_adjust_cfa_offset (16)
  cfi_rel_offset (r5, 0)
  cfi_rel_offset (r6, 4)
@@ -147,7 +147,7 @@ ENTRY(memmove)
 
  CALGN( bcs 2b )
 
-7: ldmfd sp!, {r5 - r8}
+7: pop {r5 - r8}
  cfi_adjust_cfa_offset (-16)
  cfi_restore (r5)
  cfi_restore (r6)
@@ -163,13 +163,13 @@ ENTRY(memmove)
  strcsb ip, [r0, #-1]
 
 #if defined (__ARM_ARCH_4T__) && defined (__THUMB_INTERWORK__)
- ldmfd sp!, {r0, r4, lr}
+ pop {r0, r4, lr}
  cfi_adjust_cfa_offset (-12)
  cfi_restore (r4)
  cfi_restore (lr)
  bx      lr
 #else
- ldmfd sp!, {r0, r4, pc}
+ pop {r0, r4, pc}
 #endif
 
  cfi_restore_state
@@ -204,7 +204,7 @@ ENTRY(memmove)
  CALGN( subcc r2, r2, ip )
  CALGN( bcc 15f )
 
-11: stmfd sp!, {r5 - r9}
+11: push {r5 - r9}
  cfi_adjust_cfa_offset (20)
  cfi_rel_offset (r5, 0)
  cfi_rel_offset (r6, 4)
@@ -221,30 +221,30 @@ ENTRY(memmove)
 
 12: PLD( pld [r1, #-128] )
 13: ldmdb   r1!, {r7, r8, r9, ip}
- mov     lr, r3, push #\push
+ mov     lr, r3, PUSH #\push
  subs    r2, r2, #32
  ldmdb   r1!, {r3, r4, r5, r6}
- orr     lr, lr, ip, pull #\pull
- mov     ip, ip, push #\push
- orr     ip, ip, r9, pull #\pull
- mov     r9, r9, push #\push
- orr     r9, r9, r8, pull #\pull
- mov     r8, r8, push #\push
- orr     r8, r8, r7, pull #\pull
- mov     r7, r7, push #\push
- orr     r7, r7, r6, pull #\pull
- mov     r6, r6, push #\push
- orr     r6, r6, r5, pull #\pull
- mov     r5, r5, push #\push
- orr     r5, r5, r4, pull #\pull
- mov     r4, r4, push #\push
- orr     r4, r4, r3, pull #\pull
+ orr     lr, lr, ip, PULL #\pull
+ mov     ip, ip, PUSH #\push
+ orr     ip, ip, r9, PULL #\pull
+ mov     r9, r9, PUSH #\push
+ orr     r9, r9, r8, PULL #\pull
+ mov     r8, r8, PUSH #\push
+ orr     r8, r8, r7, PULL #\pull
+ mov     r7, r7, PUSH #\push
+ orr     r7, r7, r6, PULL #\pull
+ mov     r6, r6, PUSH #\push
+ orr     r6, r6, r5, PULL #\pull
+ mov     r5, r5, PUSH #\push
+ orr     r5, r5, r4, PULL #\pull
+ mov     r4, r4, PUSH #\push
+ orr     r4, r4, r3, PULL #\pull
  stmdb   r0!, {r4 - r9, ip, lr}
  bge 12b
  PLD( cmn r2, #96 )
  PLD( bge 13b )
 
- ldmfd sp!, {r5 - r9}
+ pop {r5 - r9}
  cfi_adjust_cfa_offset (-20)
  cfi_restore (r5)
  cfi_restore (r6)
@@ -255,10 +255,10 @@ ENTRY(memmove)
 14: ands ip, r2, #28
  beq 16f
 
-15: mov     lr, r3, push #\push
+15: mov     lr, r3, PUSH #\push
  ldr r3, [r1, #-4]!
  subs ip, ip, #4
- orr lr, lr, r3, pull #\pull
+ orr lr, lr, r3, PULL #\pull
  str lr, [r0, #-4]!
  bgt 15b
  CALGN( cmp r2, #0 )
diff --git a/ports/sysdeps/arm/start.S b/ports/sysdeps/arm/start.S
index a1d15b8..0a57b0b 100644
--- a/ports/sysdeps/arm/start.S
+++ b/ports/sysdeps/arm/start.S
@@ -80,14 +80,14 @@ _start:
  mov lr, #0
 
  /* Pop argc off the stack and save a pointer to argv */
- ldr a2, [sp], #4
+ pop { a2 }
  mov a3, sp
 
  /* Push stack limit */
- str a3, [sp, #-4]!
+ push { a3 }
 
  /* Push rtld_fini */
- str a1, [sp, #-4]!
+ push { a1 }
 
 #ifdef SHARED
  ldr sl, .L_GOT
@@ -97,7 +97,7 @@ _start:
  ldr ip, .L_GOT+4 /* __libc_csu_fini */
  ldr ip, [sl, ip]
 
- str ip, [sp, #-4]! /* Push __libc_csu_fini */
+ push { ip } /* Push __libc_csu_fini */
 
  ldr a4, .L_GOT+8 /* __libc_csu_init */
  ldr a4, [sl, a4]
@@ -113,7 +113,7 @@ _start:
  ldr ip, =__libc_csu_fini
 
  /* Push __libc_csu_fini */
- str ip, [sp, #-4]!
+ push { ip }
 
  /* Set up the other arguments in registers */
  ldr a1, =main
diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index d855ceb..d74a328 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -80,7 +80,7 @@
 /* Call __gnu_mcount_nc if GCC >= 4.4.  */
 #if __GNUC_PREREQ(4,4)
 #define CALL_MCOUNT \
- str lr,[sp, #-4]!; \
+ push {lr}; \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (lr, 0); \
  bl PLTJMP(mcount); \
@@ -88,11 +88,11 @@
  cfi_restore (lr)
 #else /* else call _mcount */
 #define CALL_MCOUNT \
- str lr,[sp, #-4]!; \
+ push {lr}; \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (lr, 0); \
  bl PLTJMP(mcount); \
- ldr lr, [sp], #4; \
+ pops {lr}; \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (lr)
 #endif
diff --git a/ports/sysdeps/unix/sysv/linux/arm/____longjmp_chk.S b/ports/sysdeps/unix/sysv/linux/arm/____longjmp_chk.S
index 29edec6..6ee7a1a 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/____longjmp_chk.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/____longjmp_chk.S
@@ -53,7 +53,7 @@ longjmp_msg:
  cfi_remember_state; \
  cmp sp, reg; \
  bls .Lok; \
- str r7, [sp, #-4]!; \
+ push { r7 }; \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (r7, 0); \
  mov r5, r0; \
@@ -79,7 +79,7 @@ longjmp_msg:
 .Lfail: \
  add sp, sp, #12; \
  cfi_adjust_cfa_offset (-12); \
- ldr r7, [sp], #4; \
+ pop { r7 }; \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (r7); \
  CALL_FAIL \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/clone.S b/ports/sysdeps/unix/sysv/linux/arm/clone.S
index 1bc5eab..3edebd2 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/clone.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/clone.S
@@ -49,7 +49,7 @@ ENTRY(__clone)
  mov ip, r2
 #endif
  @ new sp is already in r1
- stmfd sp!, {r4, r7}
+ push {r4, r7}
  cfi_adjust_cfa_offset (8)
  cfi_rel_offset (r4, 0)
  cfi_rel_offset (r7, 4)
@@ -61,7 +61,7 @@ ENTRY(__clone)
  cfi_endproc
  cmp r0, #0
  beq 1f
- ldmfd sp!, {r4, r7}
+ pop {r4, r7}
  blt PLTJMP(C_SYMBOL_NAME(__syscall_error))
  RETINSTR(, lr)
 
diff --git a/ports/sysdeps/unix/sysv/linux/arm/mmap.S b/ports/sysdeps/unix/sysv/linux/arm/mmap.S
index 68560b0..06b737e 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/mmap.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/mmap.S
@@ -23,11 +23,11 @@
 
 ENTRY (__mmap)
  /* shuffle args */
- str r5, [sp, #-4]!
+ push { r5 }
  cfi_adjust_cfa_offset (4)
  cfi_rel_offset (r5, 0)
  ldr r5, [sp, #8]
- str r4, [sp, #-4]!
+ push { r4 }
  cfi_adjust_cfa_offset (4)
  cfi_rel_offset (r4, 0)
  cfi_remember_state
@@ -43,10 +43,10 @@ ENTRY (__mmap)
 
  /* restore registers */
 2:
- ldr r4, [sp], #4
+ pop { r4 }
  cfi_adjust_cfa_offset (-4)
  cfi_restore (r4)
- ldr r5, [sp], #4
+ pop { r5 }
  cfi_adjust_cfa_offset (-4)
  cfi_restore (r5)
 
diff --git a/ports/sysdeps/unix/sysv/linux/arm/mmap64.S b/ports/sysdeps/unix/sysv/linux/arm/mmap64.S
index dcbab3a..d039129 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/mmap64.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/mmap64.S
@@ -34,11 +34,11 @@
  .text
 ENTRY (__mmap64)
  ldr ip, [sp, $LOW_OFFSET]
- str r5, [sp, #-4]!
+ push { r5 }
  cfi_adjust_cfa_offset (4)
  cfi_rel_offset (r5, 0)
  ldr r5, [sp, $HIGH_OFFSET]
- str r4, [sp, #-4]!
+ push { r4 }
  cfi_adjust_cfa_offset (4)
  cfi_rel_offset (r4, 0)
  cfi_remember_state
@@ -51,7 +51,7 @@ ENTRY (__mmap64)
  orr r5, ip, r5, lsl $20 @ compose page offset
  DO_CALL (mmap2, 0)
  cmn r0, $4096
- ldmfd sp!, {r4, r5}
+ pop {r4, r5}
  cfi_adjust_cfa_offset (-8)
  cfi_restore (r4)
  cfi_restore (r5)
@@ -62,7 +62,7 @@ ENTRY (__mmap64)
  cfi_restore_state
 .Linval:
  mov r0, $-EINVAL
- ldmfd sp!, {r4, r5}
+ pop {r4, r5}
  cfi_adjust_cfa_offset (-8)
  cfi_restore (r4)
  cfi_restore (r5)
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
index d5e666b..ac094df 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/sysdep-cancel.h
@@ -76,19 +76,19 @@
 
 # define DOCARGS_0 \
  .save {r7}; \
- str lr, [sp, #-4]!; \
+ push {lr}; \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (lr, 0); \
  .save {lr}
 # define UNDOCARGS_0
 # define RESTORE_LR_0 \
- ldr lr, [sp], #4; \
+ pop {lr}; \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (lr)
 
 # define DOCARGS_1 \
  .save {r7}; \
- stmfd sp!, {r0, r1, lr}; \
+ push {r0, r1, lr}; \
  cfi_adjust_cfa_offset (12); \
  cfi_rel_offset (lr, 8); \
  .save {lr}; \
@@ -102,13 +102,13 @@
 
 # define DOCARGS_2 \
  .save {r7}; \
- stmfd sp!, {r0, r1, lr}; \
+ push {r0, r1, lr}; \
  cfi_adjust_cfa_offset (12); \
  cfi_rel_offset (lr, 8); \
  .save {lr}; \
  .pad #8
 # define UNDOCARGS_2 \
- ldmfd sp!, {r0, r1}; \
+ pop {r0, r1}; \
  cfi_adjust_cfa_offset (-8); \
  RESTART_UNWIND
 # define RESTORE_LR_2 \
@@ -116,13 +116,13 @@
 
 # define DOCARGS_3 \
  .save {r7}; \
- stmfd sp!, {r0, r1, r2, r3, lr}; \
+ push {r0, r1, r2, r3, lr}; \
  cfi_adjust_cfa_offset (20); \
  cfi_rel_offset (lr, 16); \
  .save {lr}; \
  .pad #16
 # define UNDOCARGS_3 \
- ldmfd sp!, {r0, r1, r2, r3}; \
+ pop {r0, r1, r2, r3}; \
  cfi_adjust_cfa_offset (-16); \
  RESTART_UNWIND
 # define RESTORE_LR_3 \
@@ -130,13 +130,13 @@
 
 # define DOCARGS_4 \
  .save {r7}; \
- stmfd sp!, {r0, r1, r2, r3, lr}; \
+ push {r0, r1, r2, r3, lr}; \
  cfi_adjust_cfa_offset (20); \
  cfi_rel_offset (lr, 16); \
  .save {lr}; \
  .pad #16
 # define UNDOCARGS_4 \
- ldmfd sp!, {r0, r1, r2, r3}; \
+ pop {r0, r1, r2, r3}; \
  cfi_adjust_cfa_offset (-16); \
  RESTART_UNWIND
 # define RESTORE_LR_4 \
@@ -145,13 +145,13 @@
 /* r4 is only stmfd'ed for correct stack alignment.  */
 # define DOCARGS_5 \
  .save {r4, r7}; \
- stmfd sp!, {r0, r1, r2, r3, r4, lr}; \
+ push {r0, r1, r2, r3, r4, lr}; \
  cfi_adjust_cfa_offset (24); \
  cfi_rel_offset (lr, 20); \
  .save {lr}; \
  .pad #20
 # define UNDOCARGS_5 \
- ldmfd sp!, {r0, r1, r2, r3}; \
+ pop {r0, r1, r2, r3}; \
  cfi_adjust_cfa_offset (-16); \
  .fnend; \
  .fnstart; \
@@ -159,20 +159,20 @@
  .save {lr}; \
  .pad #4
 # define RESTORE_LR_5 \
- ldmfd sp!, {r4, lr}; \
+ pop {r4, lr}; \
  cfi_adjust_cfa_offset (-8); \
  /* r4 will be marked as restored later.  */ \
  cfi_restore (lr)
 
 # define DOCARGS_6 \
  .save {r4, r5, r7}; \
- stmfd sp!, {r0, r1, r2, r3, lr}; \
+ push {r0, r1, r2, r3, lr}; \
  cfi_adjust_cfa_offset (20); \
  cfi_rel_offset (lr, 16); \
  .save {lr}; \
  .pad #16
 # define UNDOCARGS_6 \
- ldmfd sp!, {r0, r1, r2, r3}; \
+ pop {r0, r1, r2, r3}; \
  cfi_adjust_cfa_offset (-16); \
  .fnend; \
  .fnstart; \
@@ -213,13 +213,13 @@ extern int __local_multiple_threads attribute_hidden;
    header.multiple_threads) == 0, 1)
 #  else
 #   define SINGLE_THREAD_P \
- stmfd sp!, {r0, lr}; \
+ push {r0, lr}; \
  cfi_adjust_cfa_offset (8); \
  cfi_rel_offset (lr, 4); \
  GET_TLS(lr); \
  NEGOFF_ADJ_BASE(r0, MULTIPLE_THREADS_OFFSET); \
  ldr ip, NEGOFF_OFF1(r0, MULTIPLE_THREADS_OFFSET); \
- ldmfd sp!, {r0, lr}; \
+ pop {r0, lr}; \
  cfi_adjust_cfa_offset (-8); \
  cfi_restore (lr); \
  teq ip, #0
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-forcedunwind.c b/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-forcedunwind.c
index caa6a26..108924d 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-forcedunwind.c
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-forcedunwind.c
@@ -93,7 +93,7 @@ asm (
 "_Unwind_Resume:\n"
 " .cfi_sections .debug_frame\n"
 " " CFI_STARTPROC "\n"
-" stmfd sp!, {r4, r5, r6, lr}\n"
+" push {r4, r5, r6, lr}\n"
 " " CFI_ADJUST_CFA_OFFSET (16)" \n"
 " " CFI_REL_OFFSET (r4, 0) "\n"
 " " CFI_REL_OFFSET (r5, 4) "\n"
@@ -108,7 +108,7 @@ asm (
 " cmp r3, #0\n"
 " beq 4f\n"
 "5: mov r0, r6\n"
-" ldmfd sp!, {r4, r5, r6, lr}\n"
+" pop {r4, r5, r6, lr}\n"
 " " CFI_ADJUST_CFA_OFFSET (-16) "\n"
 " " CFI_RESTORE (r4) "\n"
 " " CFI_RESTORE (r5) "\n"
diff --git a/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-resume.c b/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-resume.c
index 1211599..d155ea7 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-resume.c
+++ b/ports/sysdeps/unix/sysv/linux/arm/nptl/unwind-resume.c
@@ -56,7 +56,7 @@ asm (
 "_Unwind_Resume:\n"
 " .cfi_sections .debug_frame\n"
 " " CFI_STARTPROC "\n"
-" stmfd sp!, {r4, r5, r6, lr}\n"
+" push {r4, r5, r6, lr}\n"
 " " CFI_ADJUST_CFA_OFFSET (16)" \n"
 " " CFI_REL_OFFSET (r4, 0) "\n"
 " " CFI_REL_OFFSET (r5, 4) "\n"
@@ -71,7 +71,7 @@ asm (
 " cmp r3, #0\n"
 " beq 4f\n"
 "5: mov r0, r6\n"
-" ldmfd sp!, {r4, r5, r6, lr}\n"
+" pop {r4, r5, r6, lr}\n"
 " " CFI_ADJUST_CFA_OFFSET (-16) "\n"
 " " CFI_RESTORE (r4) "\n"
 " " CFI_RESTORE (r5) "\n"
diff --git a/ports/sysdeps/unix/sysv/linux/arm/syscall.S b/ports/sysdeps/unix/sysv/linux/arm/syscall.S
index 665ecb4..bdd5a52 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/syscall.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/syscall.S
@@ -23,7 +23,7 @@
 
 ENTRY (syscall)
  mov ip, sp
- stmfd sp!, {r4, r5, r6, r7}
+ push {r4, r5, r6, r7}
  cfi_adjust_cfa_offset (16)
  cfi_rel_offset (r4, 0)
  cfi_rel_offset (r5, 4)
@@ -35,7 +35,7 @@ ENTRY (syscall)
  mov r2, r3
  ldmfd ip, {r3, r4, r5, r6}
  swi 0x0
- ldmfd sp!, {r4, r5, r6, r7}
+ pop {r4, r5, r6, r7}
  cfi_adjust_cfa_offset (-16)
  cfi_restore (r4)
  cfi_restore (r5)
diff --git a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
index 01d8123..39872b8 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
@@ -147,23 +147,22 @@ __local_syscall_error: \
 # else
 #  if defined(__ARM_ARCH_4T__) && defined(__THUMB_INTERWORK__)
 #   define POP_PC \
-  ldr lr, [sp], #4; \
+  pop { lr }; \
   cfi_adjust_cfa_offset (-4); \
   cfi_restore (lr); \
   bx lr
 #  else
-#   define POP_PC  \
-  ldr pc, [sp], #4
+#   define POP_PC  pop { pc }
 #  endif
 #  define SYSCALL_ERROR_HANDLER \
 __local_syscall_error: \
- str lr, [sp, #-4]!; \
+ push { lr }; \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (lr, 0); \
- str r0, [sp, #-4]!; \
+ push { r0 };     \
  cfi_adjust_cfa_offset (4); \
  bl PLTJMP(C_SYMBOL_NAME(__errno_location)); \
- ldr r1, [sp], #4; \
+ pop { r1 }; \
  cfi_adjust_cfa_offset (-4); \
  rsb r1, r1, #0; \
  str r1, [r0]; \
@@ -230,7 +229,7 @@ __local_syscall_error: \
 #undef  DOARGS_0
 #define DOARGS_0 \
  .fnstart; \
- str r7, [sp, #-4]!; \
+ push { r7 }; \
  cfi_adjust_cfa_offset (4); \
  cfi_rel_offset (r7, 0); \
  .save { r7 }
@@ -245,7 +244,7 @@ __local_syscall_error: \
 #undef  DOARGS_5
 #define DOARGS_5 \
  .fnstart; \
- stmfd sp!, {r4, r7}; \
+ push {r4, r7}; \
  cfi_adjust_cfa_offset (8); \
  cfi_rel_offset (r4, 0); \
  cfi_rel_offset (r7, 4); \
@@ -255,7 +254,7 @@ __local_syscall_error: \
 #define DOARGS_6 \
  .fnstart; \
  mov ip, sp; \
- stmfd sp!, {r4, r5, r7}; \
+ push {r4, r5, r7}; \
  cfi_adjust_cfa_offset (12); \
  cfi_rel_offset (r4, 0); \
  cfi_rel_offset (r5, 4); \
@@ -266,7 +265,7 @@ __local_syscall_error: \
 #define DOARGS_7 \
  .fnstart; \
  mov ip, sp; \
- stmfd sp!, {r4, r5, r6, r7}; \
+ push {r4, r5, r6, r7}; \
  cfi_adjust_cfa_offset (16); \
  cfi_rel_offset (r4, 0); \
  cfi_rel_offset (r5, 4); \
@@ -277,7 +276,7 @@ __local_syscall_error: \
 
 #undef  UNDOARGS_0
 #define UNDOARGS_0 \
- ldr r7, [sp], #4; \
+ pop {r7}; \
  cfi_adjust_cfa_offset (-4); \
  cfi_restore (r7); \
  .fnend
@@ -291,14 +290,14 @@ __local_syscall_error: \
 #define UNDOARGS_4 UNDOARGS_0
 #undef  UNDOARGS_5
 #define UNDOARGS_5 \
- ldmfd sp!, {r4, r7}; \
+ pop {r4, r7}; \
  cfi_adjust_cfa_offset (-8); \
  cfi_restore (r4); \
  cfi_restore (r7); \
  .fnend
 #undef  UNDOARGS_6
 #define UNDOARGS_6 \
- ldmfd sp!, {r4, r5, r7}; \
+ pop {r4, r5, r7}; \
  cfi_adjust_cfa_offset (-12); \
  cfi_restore (r4); \
  cfi_restore (r5); \
@@ -306,7 +305,7 @@ __local_syscall_error: \
  .fnend
 #undef  UNDOARGS_7
 #define UNDOARGS_7 \
- ldmfd sp!, {r4, r5, r6, r7}; \
+ pop {r4, r5, r6, r7}; \
  cfi_adjust_cfa_offset (-16); \
  cfi_restore (r4); \
  cfi_restore (r5); \
diff --git a/ports/sysdeps/unix/sysv/linux/arm/vfork.S b/ports/sysdeps/unix/sysv/linux/arm/vfork.S
index ae931f7..128a640 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/vfork.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/vfork.S
@@ -37,7 +37,7 @@ ENTRY (__vfork)
  mov ip, r7
  cfi_register (r7, ip)
  .fnstart
- str r7, [sp, #-4]!
+ push { r7 }
  cfi_adjust_cfa_offset (4)
  .save { r7 }
  ldr r7, =SYS_ify (vfork)
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 06/14] arm: Delete LOADREGS macro

Richard Henderson
In reply to this post by Richard Henderson
There was only one user.  Its "condition" argument was used
for "ia" rather than an actual condition.  The apcs26 syntax
is almost certainly not needed, given current binutils requirements.
---
        * sysdeps/arm/__longjmp.S (__longjmp): Use ldmia insn directly.
        * sysdeps/arm/sysdep.h (LOADREGS): Remove.
---
 ports/sysdeps/arm/__longjmp.S | 2 +-
 ports/sysdeps/arm/sysdep.h    | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/ports/sysdeps/arm/__longjmp.S b/ports/sysdeps/arm/__longjmp.S
index a3a2a8a..1d5d56b 100644
--- a/ports/sysdeps/arm/__longjmp.S
+++ b/ports/sysdeps/arm/__longjmp.S
@@ -37,7 +37,7 @@ ENTRY (__longjmp)
  cfi_undefined (r4)
  CHECK_SP (r4)
 #endif
- LOADREGS(ia, ip!, {v1-v6, sl, fp, sp, lr})
+ ldmia ip!, {v1-v6, sl, fp, sp, lr}
  cfi_restore (v1)
  cfi_restore (v2)
  cfi_restore (v3)
diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index d74a328..5f6c3f2 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -35,8 +35,6 @@
 
 /* APCS-32 doesn't preserve the condition codes across function call. */
 #ifdef __APCS_32__
-#define LOADREGS(cond, base, reglist...)\
- ldm##cond base,reglist
 #ifdef __USE_BX__
 #define RETINSTR(cond, reg) \
  bx##cond reg
@@ -49,8 +47,6 @@
  mov pc, _reg
 #endif
 #else  /* APCS-26 */
-#define LOADREGS(cond, base, reglist...)\
- ldm##cond base,reglist^
 #define RETINSTR(cond, reg) \
  mov##cond##s pc, reg
 #define DO_RET(_reg) \
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 07/14] arm: Commonize BX conditionals

Richard Henderson
In reply to this post by Richard Henderson
Add BLX macro in addition and use it where appropriate.
---
        * sysdeps/arm/sysdep.h (BX, BXC, BLX): New macros.
        (DO_RET): Use BX.
        (RETINSTR): Use BXC.
        * sysdeps/arm/dl-tlsdesc.S (BX): Remove.
        * sysdeps/arm/dl-trampoline.S (BX): Remove.
        (_dl_runtime_profile): Use BLX.
---
 ports/sysdeps/arm/dl-tlsdesc.S    |  6 ------
 ports/sysdeps/arm/dl-trampoline.S |  9 +--------
 ports/sysdeps/arm/sysdep.h        | 29 +++++++++++++----------------
 3 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/ports/sysdeps/arm/dl-tlsdesc.S b/ports/sysdeps/arm/dl-tlsdesc.S
index 3a956de..aa3db80 100644
--- a/ports/sysdeps/arm/dl-tlsdesc.S
+++ b/ports/sysdeps/arm/dl-tlsdesc.S
@@ -20,12 +20,6 @@
 #include <tls.h>
 #include "tlsdesc.h"
 
-#ifdef __USE_BX__
-  #define BX(x) bx x
-#else
-  #define BX(x) mov pc, x
-#endif
-
  .text
  @ emit debug information with cfi
  @ use arm-specific pseudos for unwinding itself
diff --git a/ports/sysdeps/arm/dl-trampoline.S b/ports/sysdeps/arm/dl-trampoline.S
index f2d1679..9366976 100644
--- a/ports/sysdeps/arm/dl-trampoline.S
+++ b/ports/sysdeps/arm/dl-trampoline.S
@@ -21,12 +21,6 @@
 #include <sysdep.h>
 #include <libc-symbols.h>
 
-#if defined(__USE_BX__)
-#define BX(x) bx x
-#else
-#define BX(x) mov pc, x
-#endif
-
  .text
  .globl _dl_runtime_resolve
  .type _dl_runtime_resolve, #function
@@ -192,8 +186,7 @@ _dl_runtime_profile:
  add ip, r7, #72
  ldmia ip, {r0-r3}
  ldr ip, [r7, #264]
- mov lr, pc
- BX(ip)
+ BLX(ip)
  stmia r7, {r0-r3}
 
  @ Call pltexit.
diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index 5f6c3f2..84313fe 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -33,26 +33,23 @@
 
 #define PLTJMP(_x) _x##(PLT)
 
-/* APCS-32 doesn't preserve the condition codes across function call. */
-#ifdef __APCS_32__
 #ifdef __USE_BX__
-#define RETINSTR(cond, reg) \
- bx##cond reg
-#define DO_RET(_reg) \
- bx _reg
+# define BX(R) bx R
+# define BXC(C, R) bx##C R
+# ifdef __ARM_ARCH_4T__
+#  define BLX(R) mov lr, pc; bx R
+# else
+#  define BLX(R) blx R
+# endif
 #else
-#define RETINSTR(cond, reg) \
- mov##cond pc, reg
-#define DO_RET(_reg) \
- mov pc, _reg
-#endif
-#else  /* APCS-26 */
-#define RETINSTR(cond, reg) \
- mov##cond##s pc, reg
-#define DO_RET(_reg) \
- movs pc, _reg
+# define BX(R) mov pc, R
+# define BXC(C, R) mov##C pc, R
+# define BLX(R) mov lr, pc; mov pc, R
 #endif
 
+#define DO_RET(R) BX(R)
+#define RETINSTR(C, R) BXC(C, R)
+
 /* Define an entry point visible from C.  */
 #define ENTRY(name) \
  .globl C_SYMBOL_NAME(name); \
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 08/14] arm: Unless arm4t, pop return address directly into pc

Richard Henderson
In reply to this post by Richard Henderson
Unless we're trying old interworking, there's no point restoring to
LR first.  Everything from armv5 on handles pop as an interworking jump.
---
        * sysdeps/arm/arm-mcount.S (_mcount): Use pop into pc unless
        __ARM_ARCH_4T__ and __THUMB_INTERWORK__.
        * sysdeps/arm/dl-tlsdesc.S (_dl_tlsdesc_dynamic): Likewise.
---
 ports/sysdeps/arm/arm-mcount.S | 6 +++---
 ports/sysdeps/arm/dl-tlsdesc.S | 9 +++++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/ports/sysdeps/arm/arm-mcount.S b/ports/sysdeps/arm/arm-mcount.S
index b6e5ec7..8ad0779 100644
--- a/ports/sysdeps/arm/arm-mcount.S
+++ b/ports/sysdeps/arm/arm-mcount.S
@@ -82,9 +82,7 @@ ENTRY(_mcount)
  ldrne r0, [r0, #-4]
  movsne r1, lr
  blne __mcount_internal
-#ifdef __thumb2__
- pop {r0, r1, r2, r3, fp, pc}
-#else
+#if defined (__ARM_ARCH_4T__) && defined (__THUMB_INTERWORK__)
  pop {r0, r1, r2, r3, fp, lr}
  cfi_adjust_cfa_offset (-24)
  cfi_restore (r0)
@@ -94,6 +92,8 @@ ENTRY(_mcount)
  cfi_restore (fp)
  cfi_restore (lr)
  bx lr
+#else
+ pop {r0, r1, r2, r3, fp, pc}
 #endif
 END(_mcount)
 
diff --git a/ports/sysdeps/arm/dl-tlsdesc.S b/ports/sysdeps/arm/dl-tlsdesc.S
index aa3db80..4635415 100644
--- a/ports/sysdeps/arm/dl-tlsdesc.S
+++ b/ports/sysdeps/arm/dl-tlsdesc.S
@@ -109,13 +109,18 @@ _dl_tlsdesc_dynamic:
 1: mov r0, r1
  bl __tls_get_addr
  rsb r0, r4, r0
-2: pop {r2,r3,r4, lr}
+2:
+#if defined (__ARM_ARCH_4T__) && defined (__THUMB_INTERWORK__)
+ pop {r2,r3,r4, lr}
  cfi_adjust_cfa_offset (-16)
  cfi_restore (lr)
  cfi_restore (r4)
  cfi_restore (r3)
  cfi_restore (r2)
- BX      (lr)
+ bx lr
+#else
+ pop {r2,r3,r4, pc}
+#endif
  .fnend
  cfi_endproc
  .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 09/14] arm: Tidy architecture selection

Richard Henderson
In reply to this post by Richard Henderson
---
        * sysdeps/arm/sysdep.h (__ARM_ARCH): New macro.
        (ARCH_HAS_BX, ARCH_HAS_BLX, ARCH_HAS_T2): New macros.
        (BX): Select on ARCH_HAS_BX instead of __USE_BX__.
        (BLX): Select on ARCH_HAS_BLX instead of __ARM_ARCH_4T__.
        * ports/sysdeps/arm/dl-machine.h (BX): Select on ARCH_HAS_BX
        instead of __USE_BX__.
---
 ports/sysdeps/arm/dl-machine.h |  2 +-
 ports/sysdeps/arm/sysdep.h     | 41 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/ports/sysdeps/arm/dl-machine.h b/ports/sysdeps/arm/dl-machine.h
index 30ad46c..5a424f8 100644
--- a/ports/sysdeps/arm/dl-machine.h
+++ b/ports/sysdeps/arm/dl-machine.h
@@ -136,7 +136,7 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
   return lazy;
 }
 
-#if defined(__USE_BX__)
+#if defined(ARCH_HAS_BX)
 #define BX(x) "bx\t" #x
 #else
 #define BX(x) "mov\tpc, " #x
diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index 84313fe..c09e680 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -19,9 +19,36 @@
 #include <sysdeps/generic/sysdep.h>
 #include <features.h>
 
-#if (!defined (__ARM_ARCH_2__) && !defined (__ARM_ARCH_3__) \
-     && !defined (__ARM_ARCH_3M__) && !defined (__ARM_ARCH_4__))
-# define __USE_BX__
+/* The __ARM_ARCH define is provided by gcc 4.8.  Construct it otherwise.  */
+#ifndef __ARM_ARCH
+# ifdef __ARM_ARCH_2__
+#  define __ARM_ARCH 2
+# elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+#  define __ARM_ARCH 3
+# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+#  define __ARM_ARCH 4
+# elif defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) \
+       || defined(__ARM_ARCH_5TEJ__)
+#  define __ARM_ARCH 5
+# elif defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6ZK__) \
+       || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__)
+#  define __ARM_ARCH 6
+# elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) \
+       || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
+#  define __ARM_ARCH 7
+# else
+#  error unknown arm architecture
+# endif
+#endif
+
+#if __ARM_ARCH > 4 || defined(__ARM_ARCH_4T__)
+# define ARCH_HAS_BX
+#endif
+#if __ARM_ARCH > 4
+# define ARCH_HAS_BLX
+#endif
+#if __ARM_ARCH > 6 || defined(__ARM_ARCH_6T2__)
+# define ARCH_HAS_T2
 #endif
 
 #ifdef __ASSEMBLER__
@@ -33,13 +60,13 @@
 
 #define PLTJMP(_x) _x##(PLT)
 
-#ifdef __USE_BX__
+#ifdef ARCH_HAS_BX
 # define BX(R) bx R
 # define BXC(C, R) bx##C R
-# ifdef __ARM_ARCH_4T__
-#  define BLX(R) mov lr, pc; bx R
-# else
+# ifdef ARCH_HAS_BLX
 #  define BLX(R) blx R
+# else
+#  define BLX(R) mov lr, pc; bx R
 # endif
 #else
 # define BX(R) mov pc, R
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 10/14] arm: Implement hard-tp for GET_TLS

Richard Henderson
In reply to this post by Richard Henderson
---
        * sysdeps/arm/sysdep.h (ARCH_HAS_HARD_TP): New macro.
        (GET_TLS): Use hard-tp if ARCH_HAS_HARD_TP.
        * sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S: Likewise.
        * sysdeps/unix/sysv/linux/arm/sysdep.h (GET_TLS): Don't override
        the default definition if ARCH_HAS_HARD_TP.
---
 ports/sysdeps/arm/sysdep.h                        | 14 +++++++++++---
 ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S |  5 ++++-
 ports/sysdeps/unix/sysv/linux/arm/sysdep.h        | 16 +++++++++-------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/ports/sysdeps/arm/sysdep.h b/ports/sysdeps/arm/sysdep.h
index c09e680..03739a4 100644
--- a/ports/sysdeps/arm/sysdep.h
+++ b/ports/sysdeps/arm/sysdep.h
@@ -47,6 +47,9 @@
 #if __ARM_ARCH > 4
 # define ARCH_HAS_BLX
 #endif
+#if __ARM_ARCH > 6 || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__)
+# define ARCH_HAS_HARD_TP
+#endif
 #if __ARM_ARCH > 6 || defined(__ARM_ARCH_6T2__)
 # define ARCH_HAS_T2
 #endif
@@ -184,10 +187,14 @@
 /* Helper to get the TLS base pointer.  The interface is that TMP is a
    register that may be used to hold the LR, if necessary.  TMP may be
    LR itself to indicate that LR need not be saved.  The base pointer
-   is returned in R0.  Only R0 and TMP are modified.
+   is returned in R0.  Only R0 and TMP are modified.  */
 
-   At this generic level we have no tricks to pull.  Call the ABI routine.  */
-# define GET_TLS(TMP) \
+# ifdef ARCH_HAS_HARD_TP
+/* If the cpu has cp15 available, use it.  */
+#  define GET_TLS(TMP) mrc p15, 0, r0, c13, c0, 3
+# else
+/* At this generic level we have no tricks to pull.  Call the ABI routine.  */
+#  define GET_TLS(TMP) \
  push { r1, r2, r3, lr }; \
  cfi_remember_state; \
  cfi_adjust_cfa_offset (16); \
@@ -198,6 +205,7 @@
  bl __aeabi_read_tp; \
  pop { r1, r2, r3, lr }; \
  cfi_restore_state
+# endif /* ARCH_HAS_HARD_TP */
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S b/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S
index ecdc322..21e3229 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S
+++ b/ports/sysdeps/unix/sysv/linux/arm/aeabi_read_tp.S
@@ -41,7 +41,10 @@
 
  .hidden __aeabi_read_tp
 ENTRY (__aeabi_read_tp)
-#ifdef __thumb2__
+#ifdef ARCH_HAS_HARD_TP
+ mrc p15, 0, r0, c13, c0, 3
+ bx lr
+#elif defined(__thumb2__)
  movw r0, #0x0fe0
  movt r0, #0xffff
  bx r0
diff --git a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
index 39872b8..89fea7a 100644
--- a/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
+++ b/ports/sysdeps/unix/sysv/linux/arm/sysdep.h
@@ -45,26 +45,27 @@
 
 #ifdef __ASSEMBLER__
 
+#ifndef ARCH_HAS_HARD_TP
 /* Internal macro calling the linux kernel kuser_get_tls helper.
    Note that in thumb mode, a constant pool break is often out of range, so
    we always expand the constant inline.  */
-#ifdef __thumb2__
-# define GET_TLS_BODY \
+# ifdef __thumb2__
+#  define GET_TLS_BODY \
  movw r0, #0x0fe0; \
  movt r0, #0xffff; \
  blx r0
-#else
-# define GET_TLS_BODY \
+# else
+#  define GET_TLS_BODY \
  mov r0, #0xffff0fff; /* Point to the high page.  */ \
  mov lr, pc; /* Save our return address.  */ \
  sub pc, r0, #31 /* Jump to the TLS entry.  */
-#endif
+# endif
 
 /* Helper to get the TLS base pointer.  Save LR in TMP, return in R0,
    and no other registers clobbered.  TMP may be LR itself to indicate
    that no save is necessary.  */
-#undef GET_TLS
-#define GET_TLS(TMP) \
+# undef GET_TLS
+# define GET_TLS(TMP) \
   .ifnc TMP, lr; \
  mov TMP, lr; \
  cfi_register (lr, TMP); \
@@ -74,6 +75,7 @@
   .else; \
  GET_TLS_BODY; \
   .endif
+#endif /* ARCH_HAS_HARD_TP */
 
 /* Linux uses a negative return value to indicate syscall errors,
    unlike most Unices, which use the condition codes' carry flag.
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 11/14] arm: Add optimized ffs for armv6t2

Richard Henderson
In reply to this post by Richard Henderson
---
        * sysdeps/arm/armv6t2/ffs.S: New file.
        * sysdeps/arm/armv6t2/ffsll.S: New file.
---
 ports/sysdeps/arm/armv6t2/ffs.S   | 35 +++++++++++++++++++++++++++
 ports/sysdeps/arm/armv6t2/ffsll.S | 50 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 ports/sysdeps/arm/armv6t2/ffs.S
 create mode 100644 ports/sysdeps/arm/armv6t2/ffsll.S

diff --git a/ports/sysdeps/arm/armv6t2/ffs.S b/ports/sysdeps/arm/armv6t2/ffs.S
new file mode 100644
index 0000000..b2c88b9
--- /dev/null
+++ b/ports/sysdeps/arm/armv6t2/ffs.S
@@ -0,0 +1,35 @@
+/* ffs -- find first set bit in an int, from least significant end.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+ENTRY (__ffs) /* int ffs (int x): 1-based index of the lowest set bit of r0, or 0 when r0 == 0.  */
+ cmp r0, #0 /* Z set iff x == 0; the IT block below tests this */
+ rbit r0, r0 /* reverse the bits (flags untouched): lowest set bit becomes highest */
+ itt ne
+ clzne r0, r0 /* x != 0: leading zeros of the reversed word = zero-based bit index */
+ addne r0, r0, #1 /* convert to the 1-based result */
+ bx lr /* x == 0 path returns r0 == rbit (0) == 0 */
+END (__ffs)
+
+weak_alias (__ffs, ffs)
+weak_alias (__ffs, ffsl) /* presumably valid because long and int have the same width here -- TODO confirm */
+libc_hidden_builtin_def (ffs)
diff --git a/ports/sysdeps/arm/armv6t2/ffsll.S b/ports/sysdeps/arm/armv6t2/ffsll.S
new file mode 100644
index 0000000..e49c70f
--- /dev/null
+++ b/ports/sysdeps/arm/armv6t2/ffsll.S
@@ -0,0 +1,50 @@
+/* ffsll -- find first set bit in a long long, from least significant end.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+ENTRY (ffsll) /* int ffsll (long long x): 1-based index of the lowest set bit, or 0 when x == 0.  */
+ @ If low part is 0, operate on the high part.  Ensure that the
+ @ word on which we operate is in r0.  Set r2 to the bit offset
+ @ of the word being considered.  Set the flags for the word
+ @ being operated on.
+#ifdef __ARMEL__
+ cmp r0, #0 /* little-endian: r0 holds the low word, r1 the high word */
+ itee ne
+ movne r2, #0 /* low word non-zero: use it, bit offset 0 */
+ moveq r2, #32 /* low word zero: fall back to the high word, bit offset 32 */
+ movseq r0, r1 /* ...moving it into r0 and setting Z from it */
+#else
+ cmp r1, #0 /* big-endian: r1 holds the low word, r0 the high word */
+ ittee ne
+ movne r2, #0 /* low word non-zero: bit offset 0 */
+ movne r0, r1 /* ...and move it into r0 */
+ moveq r2, #32 /* low word zero: high word is already in r0, bit offset 32 */
+ cmpeq r0, #0 /* ...so only Z needs setting from the high word */
+#endif
+ @ Perform the ffs on r0.
+ rbit r0, r0 /* flags untouched: lowest set bit becomes highest */
+ ittt ne
+ clzne r0, r0 /* zero-based bit index within the selected word */
+ addne r2, r2, #1 /* word offset plus 1 for the 1-based result */
+ addne r0, r0, r2
+ bx lr /* both words zero: r0 == rbit (0) == 0 */
+END (ffsll)
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 12/14] arm: Add optimized addmul_1

Richard Henderson
In reply to this post by Richard Henderson
Written from scratch rather than copied from GMP, due to LGPL 2.1 vs
GPL 3, but tested with the GMP testsuite.

This is 25% faster than the generic code as measured on Cortex-A15,
and the same speed as GMP on the same core.  It's probably slower
than GMP on the A8 and A9 cores though.
---
        * sysdeps/arm/addmul_1.S: New file.
---
 ports/sysdeps/arm/addmul_1.S | 66 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 ports/sysdeps/arm/addmul_1.S

diff --git a/ports/sysdeps/arm/addmul_1.S b/ports/sysdeps/arm/addmul_1.S
new file mode 100644
index 0000000..4e2f6da
--- /dev/null
+++ b/ports/sysdeps/arm/addmul_1.S
@@ -0,0 +1,66 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+@ cycles/limb
+@ StrongArm   ?
+@ Cortex-A8   ?
+@ Cortex-A9   ?
+@ Cortex-A15   4
+
+/* mp_limb_t mpn_addmul_1(res_ptr, src1_ptr, size, s2_limb): res_ptr[i] += src1_ptr[i] * s2_limb for each of SIZE limbs; returns the final carry limb.  In: r0 = res_ptr, r1 = src1_ptr, r2 = size, r3 = s2_limb.  */
+
+ENTRY(__mpn_addmul_1)
+ push { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r5, 4)
+ cfi_rel_offset (r6, 8)
+ cfi_rel_offset (r7, 12)
+
+ ldr r6, [r1], #4 /* ul = first source limb */
+ ldr r5, [r0] /* rl = first result limb */
+ mov r4, #0 /* init carry in */
+ b 1f /* enter the loop at the multiply */
+0:
+ ldr r6, [r1], #4 /* load next ul */
+ adds r7, r4, r5 /* (out, c) = cl + lpl */
+ ldr r5, [r0, #4] /* load next rl */
+ adc r4, ip, #0 /* cl = hpl + c */
+ str r7, [r0], #4 /* store out and step res_ptr */
+1:
+ mov ip, #0 /* zero-extend rl */
+ umlal r5, ip, r6, r3 /* (hpl, lpl) = ul * vl + rl */
+ subs r2, r2, #1 /* one limb consumed; Z set when none remain */
+ bne 0b
+
+ adds r4, r4, r5 /* (out, c) = cl + lpl */
+ str r4, [r0] /* store the last result limb */
+ adc r0, ip, #0 /* return hpl + c */
+
+ pop { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (r4)
+ cfi_restore (r5)
+ cfi_restore (r6)
+ cfi_restore (r7)
+ DO_RET(lr)
+END(__mpn_addmul_1)
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 13/14] arm: Add optimized submul_1

Richard Henderson
In reply to this post by Richard Henderson
Written from scratch rather than copied from GMP, due to LGPL 2.1 vs
GPL 3, but tested with the GMP testsuite.

This is 50% faster than the generic code as measured on Cortex-A15,
and the same speed as GMP on the same core.  It's probably slower
than GMP on the A8 and A9 cores though.
---
        * sysdeps/arm/submul_1.S: New file.
---
 ports/sysdeps/arm/submul_1.S | 67 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 ports/sysdeps/arm/submul_1.S

diff --git a/ports/sysdeps/arm/submul_1.S b/ports/sysdeps/arm/submul_1.S
new file mode 100644
index 0000000..35e1348
--- /dev/null
+++ b/ports/sysdeps/arm/submul_1.S
@@ -0,0 +1,67 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+@ cycles/limb
+@ StrongArm   ?
+@ Cortex-A8   ?
+@ Cortex-A9   ?
+@ Cortex-A15   4
+
+/* mp_limb_t mpn_submul_1(res_ptr, src1_ptr, size, s2_limb) */
+
+ENTRY(__mpn_submul_1)
+ push { r4, r5, r6, r7 }
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r5, 4)
+ cfi_rel_offset (r6, 8)
+ cfi_rel_offset (r7, 12)
+
+ ldr r6, [r1], #4
+ ldr r7, [r0]
+ mov r4, #0 /* init carry in */
+ b 1f
+0:
+ ldr r6, [r1], #4 /* load next ul */
+ adds r5, r5, r4 /* (lpl, c) = lpl + cl */
+ adc r4, ip, #0 /* cl = hpl + c */
+ subs r5, r7, r5 /* (lpl, !c) = rl - lpl */
+ ldr r7, [r0, #4] /* load next rl */
+ it cc
+ addcc r4, r4, #1 /* cl += !c */
+ str r5, [r0], #4
+1:
+ umull r5, ip, r6, r3 /* (hpl, lpl) = ul * vl */
+ subs r2, r2, #1
+ bne 0b
+
+ adds r5, r5, r4 /* (lpl, c) = lpl + cl */
+ adc r4, ip, #0 /* cl = hpl + c */
+ subs r5, r7, r5 /* (lpl, !c) = rl - lpl */
+ str r5, [r0], #4
+ ite cc
+ addcc r0, r4, #1 /* cl += !c */
+ movcs r0, r4 /* return carry */
+
+ pop { r4, r5, r6, r7 }
+ DO_RET(lr)
+END(__mpn_submul_1)
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

[PATCH v2 14/14] arm: Add optimized add_n and sub_n

Richard Henderson
In reply to this post by Richard Henderson
Written from scratch rather than copied from GMP, due to LGPL 2.1 vs
GPL 3, but tested with the GMP testsuite.

This is 250% faster than the generic code as measured on Cortex-A15,
and the same speed as GMP on the same core, and probably everywhere.
---
        * sysdeps/arm/add_n.S: New file.
        * sysdeps/arm/sub_n.S: New file.
---
 ports/sysdeps/arm/add_n.S | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 ports/sysdeps/arm/sub_n.S |  2 ++
 2 files changed, 85 insertions(+)
 create mode 100644 ports/sysdeps/arm/add_n.S
 create mode 100644 ports/sysdeps/arm/sub_n.S

diff --git a/ports/sysdeps/arm/add_n.S b/ports/sysdeps/arm/add_n.S
new file mode 100644
index 0000000..af69733
--- /dev/null
+++ b/ports/sysdeps/arm/add_n.S
@@ -0,0 +1,83 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .syntax unified
+ .text
+
+#ifdef USE_AS_SUB_N
+# define INITC cmp r0, r0 /* r0 - r0 sets C, i.e. no borrow pending for sbcs */
+# define OPC sbcs
+# define RETC sbc r0, r0, r0; neg r0, r0 /* r0 = 1 if a borrow is pending (C clear), else 0 */
+# define FUNC __mpn_sub_n
+#else
+# define INITC cmn r0, #0 /* r0 + 0 clears C, i.e. no carry pending for adcs */
+# define OPC adcs
+# define RETC mov r0, #0; adc r0, r0, r0 /* r0 = C */
+# define FUNC __mpn_add_n
+#endif
+
+/* mp_limb_t mpn_add_n(res_ptr, src1_ptr, src2_ptr, size): res_ptr[i] = src1_ptr[i] OPC src2_ptr[i] with carry/borrow chained across SIZE limbs; returns the final carry/borrow (0 or 1).  In: r0 = res_ptr, r1 = src1_ptr, r2 = src2_ptr, r3 = size.  */
+
+ENTRY (FUNC)
+ push { r4, r5, r6, r7, r8, r10, lr } /* 7 registers = 28 bytes; lr is reused as the end pointer */
+ cfi_adjust_cfa_offset (28)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r5, 4)
+ cfi_rel_offset (r6, 8)
+ cfi_rel_offset (r7, 12)
+ cfi_rel_offset (r8, 16)
+ cfi_rel_offset (r10, 20)
+ cfi_rel_offset (lr, 24)
+
+ INITC /* initialize carry flag */
+ tst r3, #1 /* count & 1 == 1? */
+ add lr, r1, r3, lsl #2 /* compute end src1 */
+ beq 1f
+
+ ldr r4, [r1], #4 /* do one to make count even */
+ ldr r5, [r2], #4
+ OPC r4, r4, r5
+ teq r1, lr /* end of count? (preserve carry) */
+ str r4, [r0], #4
+ beq 9f
+1:
+ tst r3, #2 /* count & 2 == 2?  */
+ beq 2f /* NOTE(review): size == 0 also reaches the loop at 2: and never matches the end pointer; presumably callers guarantee size >= 1 */
+ ldm r1!, { r4, r5 } /* do two to make count 0 mod 4 */
+ ldm r2!, { r6, r7 }
+ OPC r4, r4, r6
+ OPC r5, r5, r7
+ teq r1, lr /* end of count? */
+ stm r0!, { r4, r5 }
+ beq 9f
+2:
+ ldm r1!, { r3, r5, r7, r10 } /* do four each loop */
+ ldm r2!, { r4, r6, r8, ip } /* r3 (size) is dead by now, so it is reused as data */
+ OPC r3, r3, r4
+ OPC r5, r5, r6
+ OPC r7, r7, r8
+ OPC r10, r10, ip
+ teq r1, lr /* end of count? (carry from the OPC chain is kept) */
+ stm r0!, { r3, r5, r7, r10 }
+ bne 2b
+
+9:
+ RETC /* copy carry out */
+ pop { r4, r5, r6, r7, r8, r10, pc } /* restore saved registers and return by loading pc */
+END (FUNC)
diff --git a/ports/sysdeps/arm/sub_n.S b/ports/sysdeps/arm/sub_n.S
new file mode 100644
index 0000000..8eafa41
--- /dev/null
+++ b/ports/sysdeps/arm/sub_n.S
@@ -0,0 +1,2 @@
+#define USE_AS_SUB_N
+#include "add_n.S"
--
1.8.1.2

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 09/14] arm: Tidy architecture selection

Roland McGrath-4
In reply to this post by Richard Henderson
Space before paren in defined (FOO).
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 10/14] arm: Implement hard-tp for GET_TLS

Roland McGrath-4
In reply to this post by Richard Henderson
Space before paren in defined (FOO).
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 02/14] arm: Introduce and use NEGOFF series of macros

Roland McGrath-4
In reply to this post by Richard Henderson
Space before paren.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 03/14] arm: Introduce and use GET_TLS

Roland McGrath-4
In reply to this post by Richard Henderson
Space before paren.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH v2 12/14] arm: Add optimized addmul_1

Roland McGrath-4
In reply to this post by Richard Henderson
Top line descriptive comment, please.
12