/* memset.S */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */
/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */
  /*
   * HEXAGON_OPT_FUNC_BEGIN name
   *
   * Opens the definition of a global function called \name: selects the
   * .text section, aligns to 2^4 = 16 bytes, exports the symbol, marks it
   * as a function and emits the entry label. The function body follows
   * the macro invocation; close it with HEXAGON_OPT_FUNC_FINISH.
   */
        .macro HEXAGON_OPT_FUNC_BEGIN name
        .text
        .p2align 4
        .globl \name
        .type  \name, @function
\name:
        .endm
  /*
   * HEXAGON_OPT_FUNC_FINISH name
   *
   * Closes the function \name: records the symbol size as the distance
   * from the \name label to the current location. Place it directly after
   * the last instruction of the function.
   */
        .macro HEXAGON_OPT_FUNC_FINISH name
        .size  \name, . - \name
        .endm
  /*
   * FUNCTION: memset (v2 version)
   *
   * void *memset(void *s, int c, size_t n), built when __HEXAGON_ARCH__ < 3.
   *
   * In:   r0 = destination, r1 = fill value, r2 = byte count
   * Out:  r0 = destination (never written here, so it is returned as-is)
   * Uses: r2-r10, p0, p1 and the loop0 hardware loop
   *
   * Register roles:
   *   r3:2  remaining byte count as a 64-bit pair (r3 is cleared on entry)
   *   r4    fill byte replicated by vsplatb; r5:4 feeds the 8-byte stores
   *   r7:6  size of the store being issued, as a 64-bit pair (r7 = 0)
   *   r8    write pointer
   *   r9    8 - (s & 7): bytes until 8-byte alignment (8 when s is already
   *         aligned, in which case bits 0-2 are clear and nothing is stored)
   *   r10   iteration count of the doubleword loop
   *
   * Strategy: counts below 8 use a plain byte loop. Otherwise bits 0, 1
   * and 2 of r9 select a byte, halfword and word store that bring r8 to an
   * 8-byte boundary, a hardware loop then writes 8 bytes per iteration, and
   * bits 2, 1 and 0 of the remaining count select the trailing word,
   * halfword and byte stores.
   *
   * Reading aid (Hexagon packet semantics): instructions inside { } read
   * the register and predicate values from before the packet. A conditional
   * jump therefore tests the predicate computed by the PREVIOUS packet,
   * while the same packet computes the predicate for the next stage, and a
   * compare that shares a packet with the count update sees the count from
   * before that update.
   */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
    {
        r6 = #8
        r7 = extractu(r0, #3 , #0)      /* r7 = s & 7 */
        p0 = cmp.eq(r2, #0)             /* p0: n == 0 */
        p1 = cmp.gtu(r2, #7)            /* p1: n >= 8 */
    }
    {
        r4 = vsplatb(r1)                /* fill byte in every byte of r4 */
        r8 = r0                         /* leave r0 intact for return val */
        r9 = sub(r6, r7)                /* bytes until double alignment */
        if p0 jumpr r31                 /* count == 0, so return */
    }
    {
        r3 = #0                         /* high word of the count pair r3:2 */
        r7 = #0                         /* high word of the step pair r7:6 */
        p0 = tstbit(r9, #0)             /* p0: a byte store is needed to align */
        if p1 jump 2f                   /* skip byte loop */
    }

/* less than 8 bytes to set, so just set a byte at a time and return */

        loop0(1f, r2)                   /* byte loop */
        .falign
1: /* byte loop */
    {
        memb(r8++#1) = r4
    }:endloop0
        jumpr r31

        /* ---- alignment stage 1: optional byte store ---- */
        .falign
2: /* skip byte loop */
    {
        r6 = #1                         /* step for the byte store below */
        p0 = tstbit(r9, #1)             /* for stage 2: halfword needed? */
        p1 = cmp.eq(r2, #1)             /* p1: this byte would be the last */
        if !p0 jump 3f                  /* skip initial byte store */
    }
    {
        memb(r8++#1) = r4
        r3:2 = sub(r3:2, r7:6)          /* count -= 1 */
        if p1 jumpr r31
    }

        /* ---- alignment stage 2: optional halfword store ---- */
        .falign
3: /* skip initial byte store */
    {
        r6 = #2                         /* step for the halfword store below */
        p0 = tstbit(r9, #2)             /* for stage 3: word needed? */
        p1 = cmp.eq(r2, #2)             /* p1: these 2 bytes would be the last */
        if !p0 jump 4f                  /* skip initial half store */
    }
    {
        memh(r8++#2) = r4
        r3:2 = sub(r3:2, r7:6)          /* count -= 2 */
        if p1 jumpr r31
    }

        /* ---- alignment stage 3: optional word store ---- */
        .falign
4: /* skip initial half store */
    {
        r6 = #4                         /* step for the word store below */
        p0 = cmp.gtu(r2, #7)            /* if the word is skipped: count >= 8? */
        p1 = cmp.eq(r2, #4)             /* p1: these 4 bytes would be the last */
        if !p0 jump 5f                  /* skip initial word store */
    }
    {
        memw(r8++#4) = r4
        r3:2 = sub(r3:2, r7:6)          /* count -= 4 */
        p0 = cmp.gtu(r2, #11)           /* pre-update count > 11, i.e. >= 8 left */
        if p1 jumpr r31
    }

        /* ---- r8 is now 8-byte aligned; p0 says whether >= 8 bytes remain ---- */
        .falign
5: /* skip initial word store */
    {
        r10 = lsr(r2, #3)               /* number of whole doublewords left */
        /*
         * r3 was cleared on entry and only changes if a count subtract
         * borrows, so this compare appears to be a way of clearing p1 for
         * the test at 7: when the loop is skipped.
         * NOTE(review): confirm that this is the intent.
         */
        p1 = cmp.eq(r3, #1)
        if !p0 jump 7f                  /* skip double loop */
    }
    {
        r5 = r4                         /* r5:4 = 8 copies of the fill byte */
        r6 = #8                         /* step for the doubleword stores */
        loop0(6f, r10)                  /* double loop */
    }

/* set bytes a double word at a time */

        .falign
6: /* double loop */
    {
        memd(r8++#8) = r5:4
        r3:2 = sub(r3:2, r7:6)          /* count -= 8 */
        p1 = cmp.eq(r2, #8)             /* p1: this store used the last 8 bytes */
    }:endloop0

        /* ---- tail: fewer than 8 bytes remain ---- */
        .falign
7: /* skip double loop */
    {
        p0 = tstbit(r2, #2)             /* p0: a trailing word is needed */
        if p1 jumpr r31                 /* nothing left after the loop */
    }
    {
        r6 = #4                         /* step for the word store below */
        p0 = tstbit(r2, #1)             /* for the next stage: halfword needed? */
        p1 = cmp.eq(r2, #4)             /* p1: these 4 bytes would be the last */
        if !p0 jump 8f                  /* skip final word store */
    }
    {
        memw(r8++#4) = r4
        r3:2 = sub(r3:2, r7:6)          /* count -= 4 */
        if p1 jumpr r31
    }
        .falign
8: /* skip final word store */
    {
        p1 = cmp.eq(r2, #2)             /* p1: a halfword finishes the job */
        if !p0 jump 9f                  /* skip final half store */
    }
    {
        memh(r8++#2) = r4
        if p1 jumpr r31
    }
        .falign
9: /* skip final half store */
    {
        memb(r8++#1) = r4               /* last remaining byte */
        jumpr r31
    }
HEXAGON_OPT_FUNC_FINISH memset
#endif
  /*
   * FUNCTION: memset (v3 and higher version)
   *
   * void *memset(void *s, int c, size_t n), built when __HEXAGON_ARCH__ >= 3.
   *
   * In:   r0 = destination, r1 = fill value, r2 = byte count
   * Out:  r0 = destination (never written here, so it is returned as-is)
   * Uses: r2-r8, p0, p1 and the loop0 hardware loop
   *
   * Register roles:
   *   r2    bytes still to write
   *   r6    write pointer (all paths except the short byte loop)
   *   r7    fill byte replicated by vsplatb (halfword / word stores)
   *   r5:4  r7 twice, the source of every 8-byte store
   *   r3    byte-loop pointer, later alignment scratch / 32-byte chunk count
   *   r8    iteration count of the doubleword loop
   *
   * Strategy:
   *   n == 0     return immediately
   *   n <= 8     plain byte loop
   *   otherwise  byte / halfword / word stores until r6 is 8-byte aligned;
   *              if more than 127 bytes remain, up to three 8-byte stores
   *              until r6 is 32-byte aligned, then one dczeroa per 32-byte
   *              chunk (followed by four 8-byte stores when r1 != 0);
   *              finally an 8-byte loop and a word / halfword / byte tail.
   *
   * Reading aid (Hexagon packet semantics): instructions inside { } read
   * the register values from before the packet; only operands written as
   * ".new" (here: p0.new) use a value produced in the same packet. A
   * compare that shares a packet with "r2 = add(r2, #-k)" therefore sees
   * the count from before the subtraction.
   *
   * NOTE(review): dczeroa(r6) is assumed to zero-allocate the 32-byte cache
   * line at r6 - the loops advance r6 by 32 bytes per dczeroa and only run
   * once r6 is 32-byte aligned. Confirm against the Hexagon reference manual.
   */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
    {
        r7=vsplatb(r1)                  /* fill byte in every byte of r7 */
        r6 = r0                         /* write pointer; r0 stays untouched */
        if (r2==#0) jump:nt .L1         /* n == 0: nothing to do */
    }
    {
        r5:4=combine(r7,r7)             /* 8 fill bytes for memd stores */
        p0 = cmp.gtu(r2,#8)
        if (p0.new) jump:nt .L3         /* n > 8: take the aligned path */
    }

        /* ---- n <= 8: one byte per iteration ---- */
    {
        r3 = r0
        loop0(.L47,r2)
    }
        .falign
.L47:
    {
        memb(r3++#1) = r1
    }:endloop0 /* start=.L47 */
        jumpr r31

        /* ---- align the write pointer to 2, then 4, then 8 bytes ---- */
.L3:
    {
        p0 = tstbit(r0,#0)              /* odd address? */
        if (!p0.new) jump:nt .L8
        p1 = cmp.eq(r2, #1)             /* p1: this byte would be the last */
    }
    {
        r6 = add(r0, #1)
        r2 = add(r2,#-1)
        memb(r0) = r1
        if (p1) jump .L1
    }
.L8:
    {
        p0 = tstbit(r6,#1)              /* not yet 4-byte aligned? */
        if (!p0.new) jump:nt .L10
    }
    {
        r2 = add(r2,#-2)
        memh(r6++#2) = r7
        p0 = cmp.eq(r2, #2)             /* pre-update count: last 2 bytes? */
        if (p0.new) jump:nt .L1
    }
.L10:
    {
        p0 = tstbit(r6,#2)              /* not yet 8-byte aligned? */
        if (!p0.new) jump:nt .L12
    }
    {
        r2 = add(r2,#-4)
        memw(r6++#4) = r7
        p0 = cmp.eq(r2, #4)             /* pre-update count: last 4 bytes? */
        if (p0.new) jump:nt .L1
    }

        /* ---- r6 is 8-byte aligned; large fills go cache line by line ---- */
.L12:
    {
        p0 = cmp.gtu(r2,#127)
        if (!p0.new) jump:nt .L14       /* <= 127 bytes: plain 8-byte loop */
    }
        /*
         * Advance r6 to a 32-byte boundary. r6 is already 8-byte aligned,
         * so at most three doubleword stores are needed; the check is
         * repeated before each one.
         */
        r3 = and(r6,#31)
        if (r3==#0) jump:nt .L17
    {
        memd(r6++#8) = r5:4
        r2 = add(r2,#-8)
    }
        r3 = and(r6,#31)
        if (r3==#0) jump:nt .L17
    {
        memd(r6++#8) = r5:4
        r2 = add(r2,#-8)
    }
        r3 = and(r6,#31)
        if (r3==#0) jump:nt .L17
    {
        memd(r6++#8) = r5:4
        r2 = add(r2,#-8)
    }
.L17:
    {
        r3 = lsr(r2,#5)                 /* number of whole 32-byte chunks */
        /*
         * Tests the whole of r1, not only its low byte: a value such as
         * 0x100 takes the .L18 path, which still stores the fill bytes.
         */
        if (r1!=#0) jump:nt .L18
    }

        /* ---- fill value 0: dczeroa alone clears each 32-byte chunk ---- */
    {
        /*
         * loop0 shares this packet with the write to r3, so it is expected
         * to use the chunk count r3 held before the packet.
         * NOTE(review): the values written to r8 and r3 here are not read
         * again (r8 is rewritten at .L14) - they look like leftovers.
         */
        r8 = r3
        r3 = r6
        loop0(.L46,r3)
    }
        .falign
.L46:
    {
        dczeroa(r6)
        r6 = add(r6,#32)
        r2 = add(r2,#-32)
    }:endloop0 /* start=.L46 */

        /* ---- remaining whole doublewords ---- */
.L14:
    {
        p0 = cmp.gtu(r2,#7)
        if (!p0.new) jump:nt .L28       /* fewer than 8 bytes left */
        r8 = lsr(r2,#3)                 /* doublewords left */
    }
        loop0(.L44,r8)
        .falign
.L44:
    {
        memd(r6++#8) = r5:4
        r2 = add(r2,#-8)
    }:endloop0 /* start=.L44 */

        /* ---- tail: bits 2, 1, 0 of the count pick word / half / byte ---- */
.L28:
    {
        p0 = tstbit(r2,#2)
        if (!p0.new) jump:nt .L33
    }
    {
        r2 = add(r2,#-4)
        memw(r6++#4) = r7
    }
.L33:
    {
        p0 = tstbit(r2,#1)
        if (!p0.new) jump:nt .L35
    }
    {
        r2 = add(r2,#-2)
        memh(r6++#2) = r7
    }
.L35:
        p0 = cmp.eq(r2,#1)
        if (p0) memb(r6) = r1           /* final odd byte, if any */
.L1:
        jumpr r31                       /* r0 still holds the destination */

        /* ---- non-zero fill: dczeroa the line, then write it in full ---- */
.L18:
        loop0(.L45,r3)                  /* r3 = 32-byte chunk count from .L17 */
        .falign
.L45:
        dczeroa(r6)
    {
        memd(r6++#8) = r5:4
        r2 = add(r2,#-32)               /* whole chunk accounted for at once */
    }
        memd(r6++#8) = r5:4
        memd(r6++#8) = r5:4
    {
        memd(r6++#8) = r5:4
    }:endloop0 /* start=.L45 */
        jump .L14                       /* finish with the 8-byte loop and tail */
HEXAGON_OPT_FUNC_FINISH memset
#endif