/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4
/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare      - setup NEON registers for encryption
 * - dec_prepare      - setup NEON registers for decryption
 * - enc_switch_key   - change to new key after having prepared for encryption
 * - encrypt_block    - encrypt a single block
 * - decrypt_block    - decrypt a single block
 * - encrypt_block2x  - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x  - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x  - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x  - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */
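
/*
 * Editor's note: as the invocations below show, all of these macros share
 * one operand layout - the v-registers listed first carry the block(s) in
 * and out of the operation, followed by the round count (w3), the round
 * key pointer (x2), and two scratch registers.
 */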
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH      stp x29, x30, [sp, #-16]! ; mov x29, sp
#define FRAME_POP       ldp x29, x30, [sp], #16
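
/*
 * The out-of-line helpers below are reached via bl, so callers need a
 * stack frame to preserve x29/x30 across the call; in the inline
 * configurations further down, FRAME_PUSH/FRAME_POP expand to nothing
 * because no call is made.
 */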
#if INTERLEAVE == 2

aes_encrypt_block2x:
        encrypt_block2x v0, v1, w3, x2, x6, w7
        ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
        decrypt_block2x v0, v1, w3, x2, x6, w7
        ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
        decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

        .macro          do_encrypt_block2x
        bl              aes_encrypt_block2x
        .endm

        .macro          do_decrypt_block2x
        bl              aes_decrypt_block2x
        .endm

        .macro          do_encrypt_block4x
        bl              aes_encrypt_block4x
        .endm

        .macro          do_decrypt_block4x
        bl              aes_decrypt_block4x
        .endm

#else
#define FRAME_PUSH
#define FRAME_POP

        .macro          do_encrypt_block2x
        encrypt_block2x v0, v1, w3, x2, x6, w7
        .endm

        .macro          do_decrypt_block2x
        decrypt_block2x v0, v1, w3, x2, x6, w7
        .endm

        .macro          do_encrypt_block4x
        encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        .endm

        .macro          do_decrypt_block4x
        decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        .endm

#endif

/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                 int blocks, int first)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                 int blocks, int first)
 */
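
/*
 * Editor's note: arguments arrive per the AAPCS64 calling convention,
 * i.e. x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first.
 * 'first' is nonzero only on the initial call of a multi-call operation,
 * so enc_prepare/dec_prepare load the key schedule into the NEON
 * registers just once.
 */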
AES_ENTRY(aes_ecb_encrypt)
        FRAME_PUSH
        cbz             w5, .LecbencloopNx

        enc_prepare     w3, x2, x5

.LecbencloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lecbenc1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
        do_encrypt_block2x
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        do_encrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lecbencout
#endif
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        FRAME_POP
        ret
AES_ENDPROC(aes_ecb_encrypt)

AES_ENTRY(aes_ecb_decrypt)
        FRAME_PUSH
        cbz             w5, .LecbdecloopNx

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lecbdec1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        do_decrypt_block2x
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        do_decrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lecbdecout
#endif
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        FRAME_POP
        ret
AES_ENDPROC(aes_ecb_decrypt)

/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                 int blocks, u8 iv[], int first)
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                 int blocks, u8 iv[], int first)
 */
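
/*
 * Editor's note: CBC encryption is inherently sequential - each block
 * must be xor'ed with the previous ciphertext block before it can be
 * encrypted - so there is no interleaved fast path here, and with no
 * out-of-line calls no stack frame is needed either.
 */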
AES_ENTRY(aes_cbc_encrypt)
        cbz             w6, .Lcbcencloop

        ld1             {v0.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop:
        ld1             {v1.16b}, [x1], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
        st1             {v0.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_encrypt)
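
/*
 * Editor's note: CBC decryption does parallelise - each plaintext block
 * depends only on its own ciphertext block and the one before it - so
 * the ciphertext is copied aside before the in-place decrypt. In the
 * 4-way path only three spare registers are available, so the fourth
 * ciphertext block is reloaded from memory to serve as the next IV
 * (kept in v7).
 */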
AES_ENTRY(aes_cbc_decrypt)
        FRAME_PUSH
        cbz             w6, .LcbcdecloopNx

        ld1             {v7.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lcbcdec1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        mov             v2.16b, v0.16b
        mov             v3.16b, v1.16b
        do_decrypt_block2x
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v2.16b
        mov             v7.16b, v3.16b
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        do_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lcbcdecout
#endif
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
        mov             v7.16b, v1.16b                  /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        FRAME_POP
        st1             {v7.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_decrypt)

/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                 int blocks, u8 ctr[], int first)
 */
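
/*
 * Editor's note: the counter is kept big-endian in v4, with its low
 * 64 bits mirrored byte-swabbed in x8 so they can be bumped with a
 * plain add; a carry out of x8 is propagated into the upper half at
 * .Lctrcarry. Per the inline comment below, the block count going
 * negative flags a trailing half block, handled at .Lctrhalfblock.
 */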
AES_ENTRY(aes_ctr_encrypt)
        FRAME_PUSH
        cbz             w6, .Lctrnotfirst               /* 1st time around? */
        enc_prepare     w3, x2, x6
        ld1             {v4.16b}, [x5]

.Lctrnotfirst:
        umov            x8, v4.d[1]             /* keep swabbed ctr in reg */
        rev             x8, x8
#if INTERLEAVE >= 2
        cmn             w8, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
.LctrloopNx:
        subs            w4, w4, #INTERLEAVE
        bmi             .Lctr1x
#if INTERLEAVE == 2
        mov             v0.8b, v4.8b
        mov             v1.8b, v4.8b
        rev             x7, x8
        add             x8, x8, #1
        ins             v0.d[1], x7
        rev             x7, x8
        add             x8, x8, #1
        ins             v1.d[1], x7
        ld1             {v2.16b-v3.16b}, [x1], #32      /* get 2 input blocks */
        do_encrypt_block2x
        eor             v0.16b, v0.16b, v2.16b
        eor             v1.16b, v1.16b, v3.16b
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
        dup             v7.4s, w8
        mov             v0.16b, v4.16b
        add             v7.4s, v7.4s, v8.4s
        mov             v1.16b, v4.16b
        rev32           v8.16b, v7.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        mov             v1.s[3], v8.s[0]
        mov             v2.s[3], v8.s[1]
        mov             v3.s[3], v8.s[2]
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
        do_encrypt_block4x
        eor             v0.16b, v5.16b, v0.16b
        ld1             {v5.16b}, [x1], #16             /* get 1 input block */
        eor             v1.16b, v6.16b, v1.16b
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        add             x8, x8, #INTERLEAVE
#endif
        rev             x7, x8
        ins             v4.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
.Lctr1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lctrout
#endif
.Lctrloop:
        mov             v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        adds            x8, x8, #1              /* increment BE ctr */
        rev             x7, x8
        ins             v4.d[1], x7
        bcs             .Lctrcarry              /* overflow? */

.Lctrcarrydone:
        subs            w4, w4, #1
        bmi             .Lctrhalfblock          /* blocks < 0 means 1/2 block */
        ld1             {v3.16b}, [x1], #16
        eor             v3.16b, v0.16b, v3.16b
        st1             {v3.16b}, [x0], #16
        bne             .Lctrloop

.Lctrout:
        st1             {v4.16b}, [x5]          /* return next CTR value */
        FRAME_POP
        ret

.Lctrhalfblock:
        ld1             {v3.8b}, [x1]
        eor             v3.8b, v0.8b, v3.8b
        st1             {v3.8b}, [x0]
        FRAME_POP
        ret

.Lctrcarry:
        umov            x7, v4.d[0]             /* load upper word of ctr  */
        rev             x7, x7                  /* ... to handle the carry */
        add             x7, x7, #1
        rev             x7, x7
        ins             v4.d[0], x7
        b               .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
        .ltorg
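
/*
 * Editor's note: the .ltorg above flushes the assembler's literal pool,
 * ensuring the 128-bit constant loaded with "ldr q8, =..." in the 4-way
 * CTR path is emitted within range of its use.
 */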

/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *                 int blocks, u8 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *                 int blocks, u8 const rk2[], u8 iv[], int first)
 */
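
/*
 * Editor's note: in XTS each block is xor'ed with a per-block tweak both
 * before and after the block cipher. The initial tweak is the iv
 * encrypted with the second key (rk2); each subsequent tweak is the
 * previous one multiplied by x in GF(2^128), computed by the next_tweak
 * macro below.
 */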
        .macro          next_tweak, out, in, const, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, \const\().16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

.Lxts_mul_x:
CPU_LE( .quad           1, 0x87 )
CPU_BE( .quad           0x87, 1 )
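
/*
 * Editor's note: next_tweak computes T' = T * x in GF(2^128) with the
 * XTS reduction polynomial x^128 + x^7 + x^2 + x + 1. The "add" doubles
 * both 64-bit halves (a left shift by 1), sshr #63 smears each half's
 * top bit into an all-ones mask that is and'ed with the {1, 0x87}
 * constant above, and ext #8 swaps the halves so that the carry out of
 * the low half feeds bit 0 of the high half, while the carry out of the
 * high half wraps back into the low half as 0x87.
 */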

AES_ENTRY(aes_xts_encrypt)
        FRAME_PUSH
        cbz             w7, .LxtsencloopNx

        ld1             {v4.16b}, [x6]
        enc_prepare     w3, x5, x6
        encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
        enc_switch_key  w3, x2, x6
        ldr             q7, .Lxts_mul_x
        b               .LxtsencNx

.LxtsencloopNx:
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lxtsenc1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        do_encrypt_block2x
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        st1             {v0.16b-v1.16b}, [x0], #32
        cbz             w4, .LxtsencoutNx
        next_tweak      v4, v5, v7, v8
        b               .LxtsencNx
.LxtsencoutNx:
        mov             v4.16b, v5.16b
        b               .Lxtsencout
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v7, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
        do_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencout
        b               .LxtsencloopNx
#endif
.Lxtsenc1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lxtsencout
#endif
.Lxtsencloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        encrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsencout
        next_tweak      v4, v4, v7, v8
        b               .Lxtsencloop
.Lxtsencout:
        FRAME_POP
        ret
AES_ENDPROC(aes_xts_encrypt)
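
/*
 * Editor's note: XTS decryption derives the tweak chain exactly as
 * encryption does - the first tweak is always *encrypted* with rk2
 * (hence enc_prepare/encrypt_block on x5 below) - and only the data
 * blocks themselves go through the decryption round keys in x2.
 */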
AES_ENTRY(aes_xts_decrypt)
        FRAME_PUSH
        cbz             w7, .LxtsdecloopNx

        ld1             {v4.16b}, [x6]
        enc_prepare     w3, x5, x6
        encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
        dec_prepare     w3, x2, x6
        ldr             q7, .Lxts_mul_x
        b               .LxtsdecNx

.LxtsdecloopNx:
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lxtsdec1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        do_decrypt_block2x
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        st1             {v0.16b-v1.16b}, [x0], #32
        cbz             w4, .LxtsdecoutNx
        next_tweak      v4, v5, v7, v8
        b               .LxtsdecNx
.LxtsdecoutNx:
        mov             v4.16b, v5.16b
        b               .Lxtsdecout
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v7, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
        do_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        b               .LxtsdecloopNx
#endif
.Lxtsdec1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lxtsdecout
#endif
.Lxtsdecloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsdecout
        next_tweak      v4, v4, v7, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        FRAME_POP
        ret
AES_ENDPROC(aes_xts_decrypt)