/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/linkage.h>
#include <asm/inst.h>
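
/*
 * The input and output buffers are not guaranteed to be 16-byte aligned,
 * so data loads/stores go through VMOVDQ (unaligned moves), while the
 * round keys below are loaded with vmovdqa and are assumed to be aligned.
 */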
#define VMOVDQ		vmovdqu

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15
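
/*
 * Parameter registers follow the System V AMD64 calling convention for
 * aes_ctr_enc_*_avx_by8(in, iv, keys, out, num_bytes); see the prototype
 * comments at the bottom of this file.
 */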
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define tmp		%r10

#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
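
/*
 * byteswap_const reverses the bytes of a 16-byte block, converting between
 * the big-endian counter layout in the IV and the layout vpaddq operates on.
 * ddq_add_1..ddq_add_8 are 128-bit constants for stepping the counter by
 * 1..8 blocks; ddq_low_msk and ddq_high_add_1 propagate a carry out of the
 * low 64 bits of the counter into the high 64 bits.
 */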
.section .rodata
.align 16
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
.text
/* generate a unique variable for ddq_add_x */
.macro setddq n
	var_ddq_add = ddq_add_\n
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* club the numeric 'id' to the symbol 'name' */
.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
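
/*
 * Note on club: under .altmacro, %\id evaluates the expression in \id and
 * passes the resulting digits to setddq/setxdata, so e.g. "club XDATA, 3"
 * sets var_xdata to %xmm3 and "club DDQ_DATA, 3" sets var_ddq_add to
 * ddq_add_3.
 */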
/*
 * do_aes num_in_par load_keys key_len
 *   num_in_par: number of blocks (1-8) processed in parallel
 *   load_keys:  if nonzero, load the cached round keys into
 *               xkey0/xkey4/xkey8/xkey12 (otherwise they are assumed
 *               to be loaded already)
 *   key_len:    KEY_128, KEY_192 or KEY_256
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

	.set i, 1
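	/*
	 * Build the counter blocks for this batch: block i (1 <= i < by) is
	 * xcounter + i, computed with vpaddq on the byte-reversed counter.
	 * vptest sets ZF when the low 64 bits wrap to zero, in which case
	 * ddq_high_add_1 carries the increment into the high 64 bits.  Each
	 * block is shuffled back to big-endian before encryption; xcounter
	 * itself is advanced by the full batch the same way just below.
	 */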
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr
	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0

	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz 1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr
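
	/*
	 * AES rounds: 10/12/14 rounds for 128/192/256-bit keys.  Round keys
	 * are streamed through xkeyA/xkeyB one step ahead of their use, while
	 * four of them stay resident in xkey0/xkey4/xkey8/xkey12 (rounds
	 * 0/3/6/9 for KEY_128, 0/4/8/12 otherwise) so the main 8-block loop
	 * can run with load_keys == 0 and skip reloading them.
	 */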
	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	add	$(16*by), p_in
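	/*
	 * p_in is advanced past this batch up front; the plaintext XOR at the
	 * end of the macro therefore reads the input back through negative
	 * offsets, (i*16 - 16*by)(p_in).
	 */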
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr
	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr
	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr
		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif
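
	/*
	 * xdata0..xdata[by-1] now hold the keystream.  XOR it with the input
	 * blocks (read back through negative offsets, since p_in was already
	 * advanced) and store the result; xkeyA/xkeyB are reused as scratch
	 * registers for the input loads.
	 */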
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
/* main body of the AES CTR by8 routines */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len
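
	/*
	 * Dispatch strategy: any remainder of num_bytes modulo 8 blocks
	 * (num_bytes & 7*16) is encrypted first via do_aes_load, which also
	 * leaves the round keys cached in registers; the rest is then handled
	 * 8 blocks at a time in .Lmain_loop2 with do_aes_noload.  Only whole
	 * 16-byte blocks are processed here; a sub-block tail (num_bytes % 16)
	 * is presumably left to the caller.
	 */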
	/* tmp holds 1 to 7 leftover blocks, i.e. 16 <= tmp <= 7*16 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len
.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
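	/* ~7*16 evaluates to -128: round num_bytes down to whole 8-block chunks */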
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
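
/*
 * No remainder blocks: preload the cached round keys here instead of via
 * do_aes_load.  The offsets (0/3/6/9 for 128-bit keys, 0/4/8/12 otherwise)
 * match what do_aes expects in xkey0/xkey4/xkey8/xkey12.
 */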
.Lmult_of_8_blks\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a positive multiple of 8 blocks (8*16 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)