copyuser_power7.S

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
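
/*
 * On little endian, lvsr plus vperm with its A/B inputs swapped selects
 * the same source bytes as lvsl plus vperm does on big endian, so the
 * unaligned VPERM copy loop below can use these wrappers unchanged for
 * either endianness.
 */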
        .macro err1
100:
        .section __ex_table,"a"
        .align 3
        .llong 100b,.Ldo_err1
        .previous
        .endm

        .macro err2
200:
        .section __ex_table,"a"
        .align 3
        .llong 200b,.Ldo_err2
        .previous
        .endm
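
/*
 * Each errN macro places a local label on the instruction that follows it
 * on the same line and records an __ex_table entry pairing that address
 * with the matching fixup label, so a user-access fault on that single
 * load or store is redirected to .Ldo_errN.  For example,
 *
 *      err1;   lbz     r0,0(r4)
 *
 * assembles to "100: lbz r0,0(r4)" plus an exception-table entry
 * (100b, .Ldo_err1).  err1 is used before any stack frame has been set
 * up, err2 once the GPR save frame exists, and err3/err4 (below,
 * AltiVec only) additionally require exit_vmx_usercopy on the way out.
 */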
#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        .section __ex_table,"a"
        .align 3
        .llong 300b,.Ldo_err3
        .previous
        .endm

        .macro err4
400:
        .section __ex_table,"a"
        .align 3
        .llong 400b,.Ldo_err4
        .previous
        .endm


.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld      r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld      r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b       __copy_tofrom_user_base
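
/*
 * On a fault the .Ldo_errN labels above unwind whatever state the copy
 * had built up (non-volatile GPRs, the stack frame and, for err3/err4,
 * the VMX context), reload the original dest/src/len stashed below the
 * stack pointer on entry, and hand the whole copy to
 * __copy_tofrom_user_base, which performs the retry and returns the
 * number of bytes that could not be copied.
 *
 * __copy_tofrom_user_power7(r3 = to, r4 = from, r5 = n) returns 0 on
 * success, otherwise the uncopied byte count via the fallback above.
 */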
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
        bgt     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt     .Lshort_copy
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
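        /*
         * neg leaves the number of bytes needed to reach the next 8B
         * boundary in the low bits of r6; mtocrf 0x01 copies the low
         * four bits of r6 into cr7, so the bf tests below peel off the
         * 1, 2 and 4 byte pieces of that fix-up (cr7 bit 3 = 1B,
         * bit 2 = 2B, bit 1 = 4B).
         */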
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
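        /*
         * The CTR holds the number of whole 128B cachelines left.  All
         * sixteen 8B loads are issued before the sixteen stores so the
         * loads can run ahead of the stores; the non-volatile registers
         * saved above provide the extra scratch space for the unrolled
         * body.
         */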
        .align  5
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
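        /*
         * r5 now holds len % 128.  r6 = r5 >> 4 is the number of 16B
         * chunks left (0-7); with its low bits in cr7, bit 1 selects a
         * 64B block, bit 2 a 32B block and bit 3 a 16B block, so each
         * conditional section below drains one power-of-two sized piece.
         */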
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
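        /*
         * At most 15 bytes remain (this is also the entry point for
         * copies shorter than 16 bytes).  The low four bits of r5 go
         * into cr7 and are drained as an 8B, 4B, 2B and then 1B step.
         */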
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
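        /*
         * The TH=0b01010 forms of dcbt/dcbtst take a stream descriptor
         * rather than an address in RB.  As built here the descriptor
         * packs the transfer length in cachelines (capped at 0x3FF and
         * shifted left by 7), a prefetch depth of 7 (the 0x0E000000
         * bits) and the stream ID in the low bit; a final dcbt with only
         * the high "GO" bit (0x80000000) set starts all defined streams.
         */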
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF

1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

        .machine push
        .machine "power4"
        /* setup read stream 0 */
        dcbt    r0,r6,0b01000   /* addr from */
        dcbt    r0,r7,0b01010   /* length and depth from */
        /* setup write stream 1 */
        dcbtst  r0,r9,0b01000   /* addr to */
        dcbtst  r0,r10,0b01010  /* length and depth to */
        eieio
        dcbt    r0,r8,0b01010   /* all streams GO */
        .machine pop
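
        /*
         * cr1 still holds the result of enter_vmx_usercopy; testing it
         * only now lets the stream prefetches above be set up even when
         * we end up taking the non-VMX fallback.
         */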
        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
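        /*
         * XORing the two addresses exposes the bits in which they
         * differ; if any of the low four bits differ the two buffers
         * are not 16B-congruent, so the straight lvx/stvx loop cannot
         * be used and we fall back to the VPERM-based loop below.
         */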
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
err3;   lvx     v1,r0,r4
        addi    r4,r4,16
err3;   stvx    v1,r0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     v1,r0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,r0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,r0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,r0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,r0,r4
err4;   lvx     v6,r4,r9
err4;   lvx     v5,r4,r10
err4;   lvx     v4,r4,r11
err4;   lvx     v3,r4,r12
err4;   lvx     v2,r4,r14
err4;   lvx     v1,r4,r15
err4;   lvx     v0,r4,r16
        addi    r4,r4,128
err4;   stvx    v7,r0,r3
err4;   stvx    v6,r3,r9
err4;   stvx    v5,r3,r10
err4;   stvx    v4,r3,r11
err4;   stvx    v3,r3,r12
err4;   stvx    v2,r3,r14
err4;   stvx    v1,r3,r15
err4;   stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,r0,r4
err3;   lvx     v2,r4,r9
err3;   lvx     v1,r4,r10
err3;   lvx     v0,r4,r11
        addi    r4,r4,64
err3;   stvx    v3,r0,r3
err3;   stvx    v2,r3,r9
err3;   stvx    v1,r3,r10
err3;   stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,r0,r4
err3;   lvx     v0,r4,r9
        addi    r4,r4,32
err3;   stvx    v1,r0,r3
err3;   stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,r0,r4
        addi    r4,r4,16
err3;   stvx    v1,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx     v0,0,r4
        addi    r4,r4,16
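
        /*
         * v16 holds the permute control built by lvsl/lvsr from the
         * source misalignment, and v0 is primed with the first aligned
         * 16B block containing source data (hence the extra +16 on r4,
         * undone at label 11 below).  From here on each step loads the
         * next aligned vector and VPERM merges it with the previous one
         * to form a correctly shifted 16B for the aligned destination
         * store, so the source is only ever accessed with aligned lvx.
         */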
        bf      cr7*4+3,5f
err3;   lvx     v1,r0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,r0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
err3;   lvx     v1,r0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,r0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     v3,r0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,r0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     v7,r0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
err4;   stvx    v8,r0,r3
err4;   stvx    v9,r3,r9
err4;   stvx    v10,r3,r10
err4;   stvx    v11,r3,r11
err4;   stvx    v12,r3,r12
err4;   stvx    v13,r3,r14
err4;   stvx    v14,r3,r15
err4;   stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     v3,r0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
err3;   stvx    v8,r0,r3
err3;   stvx    v9,r3,r9
err3;   stvx    v10,r3,r10
err3;   stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     v1,r0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
err3;   stvx    v8,r0,r3
err3;   stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     v1,r0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
err3;   stvx    v8,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       exit_vmx_usercopy       /* tail call optimise */
#endif /* CONFIG_ALTIVEC */