checksum_64.S

/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31		/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
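
/*
 * For reference, an illustrative C sketch of what the routine above computes
 * (word-at-a-time, big-endian; the name and types below are hypothetical and
 * the sketch is not part of this file):
 *
 *	unsigned short ip_fast_csum_sketch(const unsigned int *iph,
 *					   unsigned int ihl)
 *	{
 *		unsigned long sum = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)	// ihl = header length in 32-bit words
 *			sum += iph[i];
 *		sum = (sum & 0xffffffffUL) + (sum >> 32);	// fold 64 -> 32
 *		sum = (sum & 0xffffffffUL) + (sum >> 32);	// absorb carry
 *		sum = (sum & 0xffff) + (sum >> 16);		// fold 32 -> 16
 *		sum = (sum & 0xffff) + (sum >> 16);		// absorb carry
 *		return (unsigned short)~sum;			// 1's complement
 *	}
 */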

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
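	/*
	 * Example: for buff = 8*k + 2, (buff >> 1) & 0x3 = 1, so the loop
	 * below runs 4 - 1 = 3 times and sums three halfwords (6 bytes),
	 * leaving r3 doubleword aligned.
	 */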
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
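	/*
	 * Example: len = 256 gives ctr = 256/64 - 1 = 3; three passes of the
	 * loop plus the exit limb after it cover 4 * 64 = 256 bytes, and the
	 * remainder (here 0) is left for the tail code.
	 */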
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6			/* ctr = len/64 - 1 */

	stdu	r1,-STACKFRAMESIZE(r1)	/* frame for saving non-volatiles */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)
	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE	/* pop the frame */

	andi.	r4,r4,63		/* bytes remaining */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
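
/*
 * Note that, unlike ip_fast_csum, the value returned here is only folded from
 * 64 bits down to 32 bits and is not complemented. Roughly, in C (illustrative
 * sketch only, not part of this file):
 *
 *	sum = (sum & 0xffffffffUL) + (sum >> 32);	// fold the two halves
 *	sum = (sum & 0xffffffffUL) + (sum >> 32);	// absorb any carry
 *	return (unsigned int)sum;	// 32-bit partial csum; the caller
 *					// folds it to 16 bits and complements
 */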

	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm
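
/*
 * Each macro above tags the load or store written on the same line after it:
 * the numeric label and the address of a fixup handler are recorded in the
 * __ex_table section, so a fault on that access branches to the handler
 * instead of oopsing. The "source"/"dest" variants are used inside the
 * unrolled copy loop, where the handler must also restore r14-r16 and pop
 * the stack frame; the "srcnr"/"dstnr" variants are used everywhere else.
 */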

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6			/* ctr = len/64 - 1 */

	stdu	r1,-STACKFRAMESIZE(r1)	/* frame for saving non-volatiles */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE	/* pop the frame */

	andi.	r5,r5,63		/* bytes remaining */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	/* fault taken inside the unrolled loop: restore regs, pop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0			/* src_err may be NULL */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	/* fault taken inside the unrolled loop: restore regs, pop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0			/* dst_err may be NULL */
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
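
/*
 * Illustrative caller-side sketch in C (hypothetical names, simplified types;
 * not part of this file), showing how the error pointers are typically
 * consumed: on a fault the routine stores -EFAULT through src_err or dst_err
 * (when non-NULL) and the caller performs whatever cleanup it needs.
 *
 *	unsigned int copy_and_csum_sketch(void *dst, const void *src, int len,
 *					  unsigned int sum)
 *	{
 *		int src_err = 0, dst_err = 0;
 *		unsigned int csum;
 *
 *		csum = csum_partial_copy_generic(src, dst, len, sum,
 *						 &src_err, &dst_err);
 *		if (src_err || dst_err) {
 *			// fault on source or destination: zero the copy,
 *			// retry, or report the error as appropriate
 *		}
 *		return csum;
 *	}
 */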