copy_user_memcpy.S 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. !
  2. ! Fast SH memcpy
  3. !
  4. ! by Toshiyasu Morita (tm@netcom.com)
  5. ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  6. ! SH5 code Copyright 2002 SuperH Ltd.
  7. !
  8. ! Entry: ARG0: destination pointer
  9. ! ARG1: source pointer
  10. ! ARG2: byte count
  11. !
  12. ! Exit: RESULT: destination pointer
  13. ! any other registers in the range r0-r7: trashed
  14. !
  15. ! Notes: Usually one wants to do small reads and write a longword, but
  16. ! unfortunately it is difficult in some cases to concatenate bytes
  17. ! into a longword on the SH, so this does a longword read and small
  18. ! writes.
  19. !
  20. ! This implementation makes two assumptions about how it is called:
  21. !
  22. ! 1.: If the byte count is nonzero, the address of the last byte to be
  23. ! copied is unsigned greater than the address of the first byte to
  24. ! be copied. This could be easily swapped for a signed comparison,
  25. ! but the algorithm used needs some comparison.
  26. !
  27. ! 2.: When there are two or three bytes in the last word of an 11-or-more
  28. ! bytes memory chunk to be copied, the rest of the word can be read
  29. ! without side effects.
  30. ! This could be easily changed by increasing the minimum size of
  31. ! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  32. ! however, this would cost a few extra cycles on average.
  33. ! For SHmedia, the assumption is that any quadword can be read in its
  34. ! entirety if at least one byte is included in the copy.
  35. /* Imported into Linux kernel by Richard Curnow. This is used to implement the
  36. __copy_user function in the general case, so it has to be a distinct
  37. function from intra-kernel memcpy to allow for exception fix-ups in the
  38. event that the user pointer is bad somewhere in the copy (e.g. due to
  39. running off the end of the vma).
  40. Note, this algorithm will be slightly wasteful in the case where the source
  41. and destination pointers are equally aligned, because the stlo/sthi pairs
  42. could then be merged back into single stores. If there are a lot of cache
  43. misses, this is probably offset by the stall lengths on the preloads.
  44. */
  45. /* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
  46. * erratum. The first two prefetches are nop-ed out to avoid upsetting the
  47. * instruction counts used in the jump address calculation.
  48. * */
  /*
   * copy_user_memcpy: entry point and size dispatch.
   *
   * Register use visible in this routine: r2 = destination, r3 = source,
   * r4 = byte count (ARG0..ARG2 of the header comment).
   *
   * Counts of 25 or more branch to Large. Smaller counts jump into a table
   * of 32-byte slots that starts at L1; the slot index is 63 - nsb(count),
   * which selects the 0, 1, 2-3, 4-7, 8-15 and 16-24 byte handlers in turn.
   * The table layout depends on exact instruction counts, so no instruction
   * between L0 and Large may be added or removed without re-padding.
   */
  49. .section .text..SHmedia32,"ax"
  50. .little
  51. .balign 32
  52. .global copy_user_memcpy
  53. .global copy_user_memcpy_end
  54. copy_user_memcpy:
  /*
   * Unaligned access helpers: each expands to a lo/hi instruction pair that
   * together cover one quadword (Q, hi part at O+7) or longword (L, hi part
   * at O+3) at base P plus offset O. Loads leave the two parts in D0 and D1.
   */
  55. #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  56. #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  57. #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  58. #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  59. nop ! ld.b r3,0,r63 ! TAKum03020
  60. pta/l Large,tr0 /* tr0 = Large */
  61. movi 25,r0
  62. bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): take the Large path */
  63. nsb r4,r0
  64. shlli r0,5,r0 /* scale by 32: one table slot is 32 bytes */
  65. movi (L1-L0+63*32 + 1) & 0xffff,r1 /* the +1 presumably flags an SHmedia target - confirm */
  66. sub r1, r0, r0 /* r0 = (L1 - L0) + (63 - nsb(count)) * 32 + 1 */
  67. L0: ptrel r0,tr0 /* tr0 = L0 + r0, i.e. the selected slot */
  68. add r2,r4,r5 /* r5 = destination + count (destination end) */
  69. ptabs r18,tr1 /* tr1 = address held in r18; every exit path blinks through tr1 (presumably the return address) */
  70. add r3,r4,r6 /* r6 = source + count (source end) */
  71. blink tr0,r63 /* jump into the table */
  /*
   * L4_7: second half of the 4..7 byte copy, reached from the
   * "4 .. 7 byte memcpy" table slot. On entry r0 holds the first four source
   * bytes and r7/r6 hold the lo/hi parts of the last four source bytes.
   * The first and last longwords may overlap; together they cover 4..7 bytes.
   * This code sits in front of L1, so it is not part of the 32-byte slots.
   */
  72. /* Rearranged to make cut2 safe */
  73. .balign 8
  74. L4_7: /* 4..7 byte memcpy cntd. */
  75. stlo.l r2, 0, r0 /* finish storing the first longword at the destination start */
  76. or r6, r7, r6 /* merge the two parts of the last source longword */
  77. sthi.l r5, -1, r6 /* store the last longword so that it ends at the destination end */
  78. stlo.l r5, -4, r6
  79. blink tr1,r63 /* exit through tr1 */
  /*
   * Jump table, slots 0 and 1. Each slot is 32 bytes (eight instructions
   * here); the dispatch code computes the target as L1 + 32 * slot index,
   * so the instruction count of every slot must stay exactly as it is.
   * Slot 0 handles a zero-byte copy and is padded with nops; its spare room
   * hosts L2_3, the tail of the 2-or-3 byte copy. Slot 1 handles a one-byte
   * copy; its spare room hosts L8_15, the tail of the 8..15 byte copy.
   */
  80. .balign 8
  81. L1: /* 0 byte memcpy */
  82. nop
  83. blink tr1,r63
  84. nop
  85. nop
  86. nop
  87. nop
  88. L2_3: /* 2 or 3 byte memcpy cntd. */
  89. st.b r5,-1,r6 /* r6 = last source byte; store it at destination end - 1 */
  90. blink tr1,r63
  91. /* 1 byte memcpy */
  92. ld.b r3,0,r0
  93. st.b r2,0,r0
  94. blink tr1,r63
  95. L8_15: /* 8..15 byte memcpy cntd. */
  96. stlo.q r2, 0, r0 /* finish storing the first quadword at the destination start */
  97. or r6, r7, r6 /* merge the two parts of the last source quadword */
  98. sthi.q r5, -1, r6 /* store the last quadword so that it ends at the destination end */
  99. stlo.q r5, -8, r6
  100. blink tr1,r63
  /*
   * Jump table, slots 2..5: the 2-3, 4..7, 8..15 and 16..24 byte copies.
   * The first three slots are exactly eight instructions (32 bytes) each and
   * must stay that way, because the dispatch code addresses them as
   * L1 + 32 * slot index. Each of them starts the copy and then jumps to its
   * continuation (L2_3, L4_7, L8_15) placed elsewhere. The 16..24 byte slot
   * is the last one, so it is allowed to run past 32 bytes and finishes inline.
   * All variants copy the head and the tail of the buffer with unaligned
   * accesses that may overlap in the middle.
   * On entry: r2 = destination, r3 = source, r5 = destination end,
   * r6 = source end, tr1 = exit target.
   */
  101. /* 2 or 3 byte memcpy */
  102. ld.b r3,0,r0
  103. nop ! ld.b r2,0,r63 ! TAKum03020
  104. ld.b r3,1,r1
  105. st.b r2,0,r0
  106. pta/l L2_3,tr0
  107. ld.b r6,-1,r6 /* r6 = last source byte (same as byte 1 when count is 2) */
  108. st.b r2,1,r1
  109. blink tr0, r63 /* continue at L2_3 */
  110. /* 4 .. 7 byte memcpy */
  111. LDUAL (r3, 0, r0, r1)
  112. pta L4_7, tr0
  113. ldlo.l r6, -4, r7 /* lo part of the last source longword */
  114. or r0, r1, r0 /* r0 = first source longword */
  115. sthi.l r2, 3, r0
  116. ldhi.l r6, -1, r6 /* hi part of the last source longword */
  117. blink tr0, r63 /* continue at L4_7 */
  118. /* 8 .. 15 byte memcpy */
  119. LDUAQ (r3, 0, r0, r1)
  120. pta L8_15, tr0
  121. ldlo.q r6, -8, r7 /* lo part of the last source quadword */
  122. or r0, r1, r0 /* r0 = first source quadword */
  123. sthi.q r2, 7, r0
  124. ldhi.q r6, -1, r6 /* hi part of the last source quadword */
  125. blink tr0, r63 /* continue at L8_15 */
  126. /* 16 .. 24 byte memcpy */
  /* NOTE(review): this slot writes r8 and r9, which the header clobber list (r0-r7) does not mention - confirm callers tolerate it. */
  127. LDUAQ (r3, 0, r0, r1)
  128. LDUAQ (r3, 8, r8, r9)
  129. or r0, r1, r0 /* r0 = source bytes 0..7 */
  130. sthi.q r2, 7, r0
  131. or r8, r9, r8 /* r8 = source bytes 8..15 */
  132. sthi.q r2, 15, r8
  133. ldlo.q r6, -8, r7 /* last source quadword, parts in r7 and r6 */
  134. ldhi.q r6, -1, r6
  135. stlo.q r2, 8, r8
  136. stlo.q r2, 0, r0
  137. or r6, r7, r6
  138. sthi.q r5, -1, r6 /* last quadword, ending at the destination end */
  139. stlo.q r5, -8, r6
  140. blink tr1,r63
  /*
   * Large: setup for copies of 25 bytes or more.
   * The first (possibly partial) source quadword is stored at the destination
   * start, then the copy proceeds with aligned source loads and unaligned
   * destination stores.
   * Values established here:
   *   r22 = destination address matching the next 8-byte aligned source address
   *   r6  = source - destination, so r22 + r6 is the current source address
   *   r5  = destination end - 16 (bound for Loop_ua)
   *   r27 = destination end - 64 (bound for Loop_line)
   *   r19/r20/r21 = r6 - 24 / - 16 / - 8 (source offsets used by Loop_line)
   *   r0  = source quadword waiting to be stored at r22
   * tr1 is reused as the Loop_ua branch target; the exit target is reloaded
   * from r18 before the routine finishes.
   * NOTE(review): r19-r22, r27 and r36 are written here although the header
   * lists only r0-r7 as trashed - confirm against the calling convention.
   */
  141. Large:
  142. ! ld.b r2, 0, r63 ! TAKum03020
  143. pta/l Loop_ua, tr1
  144. ori r3, -8, r7 /* r7 = (source & 7) - 8, a value in -8..-1 */
  145. sub r2, r7, r22 /* r22 = destination + 8 - (source & 7) */
  146. sub r3, r2, r6 /* r6 = source - destination */
  147. add r2, r4, r5
  148. ldlo.q r3, 0, r0 /* source bytes from r3 up to the end of its aligned quadword */
  149. addi r5, -16, r5 /* r5 = destination end - 16 */
  150. movi 64+8, r27 ! could subtract r7 from that.
  151. stlo.q r2, 0, r0 /* store at the destination start; bytes from r22 onward are rewritten by the later stores */
  152. sthi.q r2, 7, r0
  153. ldx.q r22, r6, r0 /* first aligned source quadword */
  154. bgtu/l r27, r4, tr1 /* fewer than 72 bytes: skip Loop_line, go to Loop_ua */
  155. addi r5, -48, r27 /* r27 = destination end - 64 */
  156. pta/l Loop_line, tr0
  157. addi r6, 64, r36 /* only referenced by the commented-out prefetch in Loop_line */
  158. addi r6, -24, r19
  159. addi r6, -16, r20
  160. addi r6, -8, r21
  /*
   * Loop_line: copies 32 bytes per iteration.
   * On entry to each iteration r0 already holds the source quadword for the
   * destination address in r22. After r22 is advanced by 32, r23/r24/r25 are
   * loaded with the following three source quadwords and all four are written
   * with sthi/stlo pairs to r22-32 .. r22-1. r0 is reloaded with the quadword
   * for the next iteration before the stores complete.
   * The alloco/synco pair is the TAKum03020 workaround mentioned in the note
   * near the top of the file (alloco presumably allocates the destination
   * cache line without fetching it - confirm against the SHmedia manual).
   * Loops while r27 >= r22 (unsigned), r27 being destination end - 64.
   */
  161. Loop_line:
  162. ! ldx.q r22, r36, r63 ! TAKum03020
  163. alloco r22, 32
  164. synco
  165. addi r22, 32, r22
  166. ldx.q r22, r19, r23 /* source quadword for r22-24 */
  167. sthi.q r22, -25, r0
  168. ldx.q r22, r20, r24 /* source quadword for r22-16 */
  169. ldx.q r22, r21, r25 /* source quadword for r22-8 */
  170. stlo.q r22, -32, r0
  171. ldx.q r22, r6, r0 /* preload the quadword for the next iteration */
  172. sthi.q r22, -17, r23
  173. sthi.q r22, -9, r24
  174. sthi.q r22, -1, r25
  175. stlo.q r22, -24, r23
  176. stlo.q r22, -16, r24
  177. stlo.q r22, -8, r25
  178. bgeu r27, r22, tr0
  /*
   * Loop_ua: copies 8 bytes per iteration, then finishes the copy.
   * Each pass stores the pending quadword in r0 at r22 (unaligned store pair)
   * and loads the next aligned source quadword from r22 + r6. It loops while
   * r5 > r22 (unsigned), r5 being destination end - 16.
   * After the loop the pending quadword is stored at r22, and the last eight
   * source bytes are loaded unaligned from source end - 8 and stored so that
   * they end exactly at the destination end; this may overlap bytes already
   * written. tr1 is reloaded from r18 because it was used as the loop target.
   */
  179. Loop_ua:
  180. addi r22, 8, r22
  181. sthi.q r22, -1, r0
  182. stlo.q r22, -8, r0
  183. ldx.q r22, r6, r0 /* next aligned source quadword */
  184. bgtu/l r5, r22, tr1
  185. add r3, r4, r7 /* r7 = source end */
  186. ldlo.q r7, -8, r1 /* lo part of the last source quadword */
  187. sthi.q r22, 7, r0
  188. ldhi.q r7, -1, r7 /* hi part of the last source quadword */
  189. ptabs r18,tr1 /* restore the exit target */
  190. stlo.q r22, 0, r0
  191. or r1, r7, r1
  192. sthi.q r5, 15, r1 /* r5 + 16 is the destination end */
  193. stlo.q r5, 8, r1
  194. blink tr1, r63
  /*
   * End-of-routine marker, exported with .global next to copy_user_memcpy.
   * Presumably used by the exception fix-up code to tell whether a faulting
   * address lies inside the copy routine - confirm against the fault handler.
   */
  195. copy_user_memcpy_end:
  196. nop