memset.S

/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */
dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6
A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9
	.weak memset
ENTRY(__memset)
ENTRY(memset)
	mov	dst, dstin	/* Preserve return value. */
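	/*
	 * Broadcast the low byte of c into every byte of A_l
	 * (e.g. 0xab -> 0xabababab_abababab) so that each store
	 * below writes the full fill pattern.
	 */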
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/* All of the stores below may be unaligned. */
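	/* n <= 15: test bits 3..0 of count and store 8, 4, 2 and 1 bytes as needed. */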
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret
.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
/*
 * count is at least 16, so we can use an unaligned stp to store the first
 * 16 bytes and then advance dst to the next 16-byte boundary.
 */
	stp	A_l, A_l, [dst]	/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2
.Laligned:
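	/* A zero fill pattern lets us try the DC ZVA block-zeroing path. */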
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
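	/*
	 * Set the remaining 0-63 bytes: branch into the right number of
	 * 16-byte stp stores for bits 5:4 of count, then handle the final
	 * 0-15 bytes with one last (possibly overlapping) stp.
	 */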
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * Fewer than 16 bytes are left, so use one stp to write the last 16 bytes
 * ending at dst + count. Some bytes may be written twice and the access
 * may be unaligned.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
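	/* 0-63 bytes remain; undo the 16-byte pre-bias and, if anything is left, finish in .Ltail63. */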
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret
	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* From here on, count is at least 128 bytes. */
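	/*
	 * DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited, and bits 3:0
	 * give log2 of the block size in words, so the block size in bytes
	 * is 4 << DCZID_EL0[3:0].
	 */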
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is at least 64: using ZVA is not worthwhile
	 * if the block size is smaller than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned; check that there's enough left to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining for ZVA is at least 64 bytes,
	 * so that the loop at 2f cannot run past the end of the buffer.
	 */
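	/*
	 * The cmp/ccmp pair implements (tmp1 >= 64) && (tmp1 >= zva_len_x):
	 * if the first compare is lt, ccmp sets NZCV to 0b1000 (N set),
	 * which also makes the b.lt below taken.
	 */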
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
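	/*
	 * Zero 64 bytes per iteration until dst reaches the next ZVA block
	 * boundary; this may overshoot the boundary, which is corrected below.
	 */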
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
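	/* Main block-zeroing loop: each DC ZVA clears a whole zva_len_x-byte block. */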
2:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
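	/* Fewer than zva_len_x bytes remain; finish any leftover bytes with the normal store path. */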
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
ENDPIPROC(memset)
ENDPROC(__memset)