checksum.S

/*
 * arch/score/lib/csum_partial.S
 *
 * Score Processor version.
 *
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 * Lennox Wu <lennox.wu@sunplusct.com>
 * Chen Liqin <liqin.chen@sunplusct.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see the file COPYING, or write
 * to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/linkage.h>
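
/*
 * ADDC(sum, reg): one's-complement accumulate.  Add reg into sum and,
 * if the 32-bit addition wrapped (sum ended up below reg), fold the
 * carry back in with an extra +1 (end-around carry).
 */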
#define ADDC(sum,reg)                   \
        add     sum, sum, reg;          \
        cmp.c   reg, sum;               \
        bleu    9f;                     \
        addi    sum, 0x1;               \
9:
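
/*
 * CSUM_BIGCHUNK(src, offset, sum): checksum a 32-byte block starting at
 * src + offset, loading four words at a time into r8-r11 and feeding
 * each one to ADDC.
 */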
#define CSUM_BIGCHUNK(src, offset, sum)         \
        lw      r8, [src, offset + 0x00];       \
        lw      r9, [src, offset + 0x04];       \
        lw      r10, [src, offset + 0x08];      \
        lw      r11, [src, offset + 0x0c];      \
        ADDC(sum, r8);                          \
        ADDC(sum, r9);                          \
        ADDC(sum, r10);                         \
        ADDC(sum, r11);                         \
        lw      r8, [src, offset + 0x10];       \
        lw      r9, [src, offset + 0x14];       \
        lw      r10, [src, offset + 0x18];      \
        lw      r11, [src, offset + 0x1c];      \
        ADDC(sum, r8);                          \
        ADDC(sum, r9);                          \
        ADDC(sum, r10);                         \
        ADDC(sum, r11)
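
/*
 * Register aliases: the buffer pointer arrives in r4 and the length in
 * r5; r6 carries the partial checksum passed by the caller, and r4 also
 * returns the result (see the mv r4, sum / br r3 sequences below).  The
 * running sum is kept in r27.  The dest alias on r5 is unused in this
 * routine.
 */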
#define src     r4
#define dest    r5
#define sum     r27

        .text

/* unknown src alignment and < 8 bytes to go */
small_csumcpy:
        mv      r5, r10
        ldi     r9, 0x0
        cmpi.c  r25, 0x1
        beq     pass_small_set_t7       /* already set, jump to pass_small_set_t7 */
        andri.c r25, r4, 0x1            /* Is src 2 bytes aligned? */
pass_small_set_t7:
        beq     aligned
        cmpi.c  r5, 0x0
        beq     fold
        lbu     r9, [src]
        slli    r9, r9, 0x8             /* Little endian */
        ADDC(sum, r9)
        addi    src, 0x1
        subi.c  r5, 0x1

/* len still a full word */
aligned:
        andri.c r8, r5, 0x4             /* Len >= 4? */
        beq     len_less_4bytes

        /* Still a full word (4 bytes) to go, and the src is word aligned. */
        andri.c r8, src, 0x3            /* src is 4 bytes aligned, so use LW!! */
        beq     four_byte_aligned
        lhu     r9, [src]
        addi    src, 2
        ADDC(sum, r9)
        lhu     r9, [src]
        addi    src, 2
        ADDC(sum, r9)
        b       len_less_4bytes

four_byte_aligned:                      /* Len >= 4 and four byte aligned */
        lw      r9, [src]
        addi    src, 4
        ADDC(sum, r9)

len_less_4bytes:                        /* 2 byte aligned and length < 4B */
        andri.c r8, r5, 0x2
        beq     len_less_2bytes
        lhu     r9, [src]
        addi    src, 0x2                /* src += 2 */
        ADDC(sum, r9)

len_less_2bytes:                        /* len = 1 */
        andri.c r8, r5, 0x1
        beq     fold                    /* less than 2 and not equal 1 --> len = 0 -> fold */
        lbu     r9, [src]

fold_ADDC:
        ADDC(sum, r9)
fold:
        /* fold checksum */
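        /*
         * Adding (sum << 16) leaves low16 + high16 in the upper half of
         * the result; the srli below extracts it, and a carry out of the
         * 32-bit add is folded back in as +1.
         */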
        slli    r26, sum, 16
        add     sum, sum, r26
        cmp.c   r26, sum
        srli    sum, sum, 16
        bleu    1f                      /* if r26 <= sum */
        addi    sum, 0x1                /* r26 > sum */
1:
        /* odd buffer alignment? r25 was set in csum_partial */
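        /*
         * If the buffer started on an odd address, every byte was summed
         * one position off, so swap the two bytes of the 16-bit result.
         */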
        cmpi.c  r25, 0x0
        beq     1f
        slli    r26, sum, 8
        srli    sum, sum, 8
        or      sum, sum, r26
        andi    sum, 0xffff
1:
        .set    optimize
        /* Add the passed partial csum. */
        ADDC(sum, r6)
        mv      r4, sum
        br      r3
        .set    volatile
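
/*
 * Entry point.  Buffers shorter than 8 bytes go straight to
 * small_csumcpy; longer buffers are first brought to 16-byte alignment
 * (hword/word/dword/qword/oword steps), summed in 128-, 64- and 32-byte
 * chunks, and the tail is finished by end_words and small_csumcpy.
 */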
        .align  5
ENTRY(csum_partial)
        ldi     sum, 0
        ldi     r25, 0
        mv      r10, r5
        cmpi.c  r5, 0x8
        blt     small_csumcpy           /* < 8 (signed) bytes to copy */
        cmpi.c  r5, 0x0
        beq     out
        andri.c r25, src, 0x1           /* odd buffer? */
        beq     word_align

hword_align:                            /* 1 byte */
        lbu     r8, [src]
        subi    r5, 0x1
        slli    r8, r8, 8
        ADDC(sum, r8)
        addi    src, 0x1

word_align:                             /* 2 bytes */
        andri.c r8, src, 0x2            /* 4 bytes (dword) aligned? */
        beq     dword_align             /* not, maybe dword_align */
        lhu     r8, [src]
        subi    r5, 0x2
        ADDC(sum, r8)
        addi    src, 0x2

dword_align:                            /* 4 bytes */
        mv      r26, r5                 /* maybe useless when len >= 56 */
        ldi     r8, 56
        cmp.c   r8, r5
        bgtu    do_end_words            /* if r5(len) < 56, unsigned */
        andri.c r26, src, 0x4
        beq     qword_align
        lw      r8, [src]
        subi    r5, 0x4
        ADDC(sum, r8)
        addi    src, 0x4

qword_align:                            /* 8 bytes */
        andri.c r26, src, 0x8
        beq     oword_align
        lw      r8, [src, 0x0]
        lw      r9, [src, 0x4]
        subi    r5, 0x8                 /* len -= 0x8 */
        ADDC(sum, r8)
        ADDC(sum, r9)
        addi    src, 0x8

oword_align:                            /* 16 bytes */
        andri.c r26, src, 0x10
        beq     begin_movement
        lw      r10, [src, 0x08]
        lw      r11, [src, 0x0c]
        lw      r8, [src, 0x00]
        lw      r9, [src, 0x04]
        ADDC(sum, r10)
        ADDC(sum, r11)
        ADDC(sum, r8)
        ADDC(sum, r9)
        subi    r5, 0x10
        addi    src, 0x10

begin_movement:
        srli.c  r26, r5, 0x7            /* len >= 128? */
        beq     1f                      /* len < 128 */

/* r26 = len / 128, computed by the srli.c above */
move_128bytes:
        CSUM_BIGCHUNK(src, 0x00, sum)
        CSUM_BIGCHUNK(src, 0x20, sum)
        CSUM_BIGCHUNK(src, 0x40, sum)
        CSUM_BIGCHUNK(src, 0x60, sum)
        subi.c  r26, 0x01               /* r26 equals len/128 */
        addi    src, 0x80
        bne     move_128bytes

1:      /* len < 128, we process 64 bytes here */
        andri.c r10, r5, 0x40
        beq     1f

move_64bytes:
        CSUM_BIGCHUNK(src, 0x00, sum)
        CSUM_BIGCHUNK(src, 0x20, sum)
        addi    src, 0x40

1:      /* len < 64 */
        andri   r26, r5, 0x1c           /* 0x1c = 28 */
        andri.c r10, r5, 0x20
        beq     do_end_words            /* decided by the andri.c above */

move_32bytes:
        CSUM_BIGCHUNK(src, 0x00, sum)
        andri   r26, r5, 0x1c
        addri   src, src, 0x20

do_end_words:                           /* len < 32 */
        /* r26 was already set in dword_align (or by the andri above) */
        cmpi.c  r26, 0x0
        beq     maybe_end_cruft         /* len < 28 or len < 56 */
        srli    r26, r26, 0x2

end_words:
        lw      r8, [src]
        subi.c  r26, 0x1                /* unit is 4 bytes */
        ADDC(sum, r8)
        addi    src, 0x4
        cmpi.c  r26, 0x0
        bne     end_words               /* r26 != 0 */

maybe_end_cruft:                        /* len < 4 */
        andri   r10, r5, 0x3

small_memcpy:
        mv      r5, r10
        j       small_csumcpy

out:
        mv      r4, sum
        br      r3

END(csum_partial)