sha1_avx2_x86_64_asm.S

/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char* input, size_t num_blocks );
 */
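
/*
 * Illustrative caller sketch (C, hypothetical; the names 'digest', 'data'
 * and 'num_blocks' are assumptions for the example, not part of this file):
 *
 *	int digest[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE,
 *			  0x10325476, 0xC3D2E1F0 };   // SHA-1 initial state
 *	// 'data' points at num_blocks * 64 bytes of message material
 *	sha1_transform_avx2(digest, data, num_blocks);
 */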
#include <linux/linkage.h>

#define CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%eax
#define REG_E	%edx
#define REG_TB	%ebx
#define REG_TA	%r12d
#define REG_RA	%rcx
#define REG_RB	%rsi
#define REG_RC	%rdi
#define REG_RD	%rax
#define REG_RE	%rdx
#define REG_RTA	%r12
#define REG_RTB	%rbx
#define REG_T1	%ebp

#define xmm_mov		vmovups
#define avx2_zeroupper	vzeroupper

#define RND_F1	1
#define RND_F2	2
#define RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 + 16)

#define WK(t)	((((t) % 80) / 4)*32 + ((t) % 4)*4 + ((t)/80)*16)(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
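
/*
 * Worked example for the WK(t) offset above (illustrative only): the two
 * interleaved blocks keep their precomputed W+K values side by side,
 * 16 bytes each, inside 32-byte groups covering four rounds.  For t = 37:
 *
 *	((37 % 80) / 4)*32 = 9*32 = 288   // group of rounds 36..39
 *	( 37 % 4)*4        = 1*4  = 4     // dword within the group
 *	( 37 / 80)*16      = 0            // first block's 16-byte half
 *
 * so WK(37) reads the dword at offset 292 from WK_BUF, while the second
 * block's round 37 (t = 117) lands 16 bytes further, at offset 308.
 */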
.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
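
/*
 * In C terms (AT&T operand order: source first), UPDATE_HASH is simply
 *
 *	val  += *hash;		// fold the old digest word into the new state
 *	*hash = val;		// write the updated word back
 *
 * leaving the sum in the register as well, so the following block is
 * processed without reloading the digest from memory.
 */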
.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
	.if (i == 0)	# Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
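
/*
 * Reference for the rounds-0..15 schedule above (C-like pseudocode,
 * illustrative only): each 8-round vector step loads 16 message bytes
 * from each of the two interleaved blocks, byte-swaps each 32-bit word
 * and adds the round constant, roughly
 *
 *	w[t..t+3]  = bswap32(block[t..t+3]);	// per block, via vpshufb
 *	wk[t..t+3] = w[t..t+3] + K;		// via vpaddd, stored for the ALU adds
 */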
.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
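
/*
 * Reference recurrence implemented above (illustrative only): rounds
 * 16..31 still use the textbook SHA-1 schedule, so the vector code must
 * work around the short w[i] -> w[i-3] dependency inside one register:
 *
 *	w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 */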
.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we use the equivalent form:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * which allows more efficient vectorization
	 * since the w[i]=>w[i-3] dependency is broken
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
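
/*
 * Why the rewritten schedule above is valid (illustrative sketch): for
 * i >= 32, expand each of w[i-3], w[i-8], w[i-14], w[i-16] one more step
 * with the same recurrence; rotation distributes over xor and the
 * duplicated terms cancel, leaving
 *
 *	w[i] = rol32(w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16], 1)
 *	     = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2)
 *
 * so the nearest input is w[i-6] rather than w[i-3], and four schedule
 * words per block (eight across the two blocks) fit in one vector op.
 */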
.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm
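
/*
 * Reference for ROUND_F1 (C-like, illustrative only): the 'choose'
 * function of rounds 0..19, computed here for the *next* round while the
 * current round's F (already in TB) is consumed:
 *
 *	F1(b, c, d) = (b & c) ^ (~b & d);	// andn yields ~b & d in one op
 *	e += rol32(a, 5) + F1(b, c, d) + K + w[t];
 */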
.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm
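
/*
 * Reference for ROUND_F2 (C-like, illustrative only): the parity function
 * of rounds 20..39 and 60..79; the final round (round_id 79) skips the
 * next-round F computation entirely:
 *
 *	F2(b, c, d) = b ^ c ^ d;
 *	e += rol32(a, 5) + F2(b, c, d) + K + w[t];
 */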
.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */
.endm
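
/*
 * Reference for ROUND_F3 (C-like, illustrative only): the majority
 * function of rounds 40..59, in the rewritten form used above so that
 * only one temporary (T1) is needed:
 *
 *	F3(b, c, d) = (b & c) | (d & (b | c));	// == (b&c)|(b&d)|(c&d)
 *	e += rol32(a, 5) + F3(b, c, d) + K + w[t];
 */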
/*
 * Add the constant \d to \a only if \b >= \c (uses RTA as a temporary):
 *	\a = (\b >= \c) ? \a + \d : \a
 */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm
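
/*
 * Example (illustrative): the first use below,
 *
 *	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 *
 * advances BUFFER_PTR2 by 64 bytes to the second input block only when at
 * least two blocks remain; otherwise both pointers keep targeting the
 * same (last) block, so the precalc loads never run past the input.
 */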
/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept 160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF
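
	/*
	 * Buffer handling note (illustrative): PRECALC_BUF and WK_BUF are two
	 * halves of the same stack scratch area, 2*4*80+32 bytes apart.  One
	 * half holds the W+K values consumed by the current two blocks while
	 * the other is filled for the next two; the xchg above (and the
	 * matching one at the bottom of the loop) flips the roles each pass:
	 *
	 *	consume = WK_BUF;  fill = PRECALC_BUF;
	 *	... run 160 rounds reading consume, writing fill ...
	 *	swap(consume, fill);
	 */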
	.align 32
_loop:
	/*
	 * code loops through more than one block;
	 * the block counter (BLOCKS_CTR) reaching zero signals that the
	 * last block has been processed, so fall through to _end
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	_begin
	.align 32
	jmp	_end
	.align 32
_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp	_loop0
_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	test	BLOCKS_CTR, BLOCKS_CTR
	jz	_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */
	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop1
_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop2
_loop2:
	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	_loop3
_loop3:
	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	_loop
	.align 32
_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	mov	%rsp, %rbx
	and	$~(0x20-1), %rsp
	push	%rbx
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR
	mov	BUF, BUFFER_PTR2

	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	add	$RESERVE_STACK, %rsp

	pop	%rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx

	ret

ENDPROC(\name)
.endm
.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
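
/*
 * Round-constant ranges (standard SHA-1, noted for reference): K1 covers
 * rounds 0-19, K2 rounds 20-39, K3 rounds 40-59, K4 rounds 60-79.  Each
 * constant is replicated below across a full 32-byte line so vpaddd can
 * add it to eight schedule words at once.
 */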
.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f

.text

SHA1_VECTOR_ASM     sha1_transform_avx2