########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
# James Guilford <james.guilford@intel.com>
# Kirk Yap <kirk.s.yap@intel.com>
# Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
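# Clarifying note (not part of the original Intel source): "2 blocks at
# a time" refers to the message schedule.  Each ymm register X0..X3
# keeps 4 schedule words of block 1 in its low 128 bits and the
# corresponding 4 words of block 2 in its high 128 bits, so every vector
# instruction below advances the schedule of both blocks at once.  The
# rounds themselves run on scalar registers; the second block is hashed
# afterwards (loop3) from the K+W values saved on the stack.
########################################################################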
#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>
## assume buffers not aligned
#define VMOVDQ vmovdqu
################################ Define Macros
# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
add \p1, \p2
mov \p2, \p1
.endm
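# Example use (see the digest update below): "addm (4*0)(CTX), a"
# computes a += mem and writes the sum back to (4*0)(CTX), folding a
# working variable into the saved digest word in one step.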
################################
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7
# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER = %ymm9
XTMP5 = %ymm11
SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13
X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
NUM_BLKS = %rdx # 3rd arg
INP = %rsi # 2nd arg
CTX = %rdi # 1st arg
c = %ecx
d = %r8d
e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP
TBL = %rbp
SRND = CTX # SRND is same register as CTX
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d
T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d
_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
_RSP_SIZE = 8
_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE
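# Resulting frame layout, for reference (offsets from the aligned %rsp):
#   _XFER    @   0 : 512 bytes of pre-computed K+W values (both blocks)
#   _INP_END @ 512 : pointer to the last input block
#   _INP     @ 520 : current input pointer
#   _CTX     @ 528 : digest/context pointer
#   _RSP     @ 536 : caller's %rsp, restored before returning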
# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
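# Note (added for clarity): rotate_Xs and ROTATE_ARGS rotate assembler
# symbols, not data.  After ROTATE_ARGS the name "a" denotes the register
# previously called "h", "e" the one previously called "d", and so on,
# which implements the SHA-256 state rotation
#   h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
# without any register-to-register moves.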
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
addl \disp(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpsrld $7, XTMP1, XTMP2
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpslld $(32-7), XTMP1, XTMP3
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
vpsrld $18, XTMP1, XTMP2
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
################################### RND N + 1 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
offset = \disp + 1*4
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
rorx $6, e, y1 # y1 = (e >> 6) # S1
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
vpslld $(32-18), XTMP1, XTMP1
and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
vpxor XTMP1, XTMP3, XTMP3
rorx $2, a, T1 # T1 = (a >> 2) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
ROTATE_ARGS
################################### RND N + 2 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
offset = \disp + 2*4
addl offset(%rsp, SRND), h # h = k + w + h # --
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
rorx $11, e, y1 # y1 = e >> 11 # S1B
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
xor g, y2 # y2 = f^g # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
and e, y2 # y2 = (f^g)&e # CH
rorx $6, e, y1 # y1 = (e >> 6) # S1
vpxor XTMP3, XTMP2, XTMP2
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
################################### RND N + 3 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
offset = \disp + 3*4
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpxor XTMP3, XTMP2, XTMP2
rorx $22, a, y1 # y1 = a >> 22 # S0A
add y0, y2 # y2 = S1 + CH # --
vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
rorx $2, a, T1 # T1 = (a >> 2) # S0
vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
rotate_Xs
.endm
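# Reference (C-like pseudocode, added for clarity; helper ror32() is a
# 32-bit rotate right): the schedule update interleaved into the four
# rounds above is the standard SHA-256 recurrence, evaluated for four
# new words per macro invocation and for both blocks at once:
#   s0   = ror32(W[t-15], 7) ^ ror32(W[t-15], 18) ^ (W[t-15] >> 3)
#   s1   = ror32(W[t-2], 17) ^ ror32(W[t-2], 19)  ^ (W[t-2] >> 10)
#   W[t] = W[t-16] + s0 + W[t-7] + s1
# The vpsrld/vpslld/vpor/vpxor sequence builds s0; the vpshufd/vpsrlq
# steps compute s1 first for the {xBxA} word pair and then for {xDxC}
# before vpaddd assembles X0 = {W[3], W[2], W[1], W[0]}.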
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
addl \disp(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS
################################### RND N + 1 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*1 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS
################################### RND N + 2 ##############################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*2 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS
################################### RND N + 3 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*3 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
.endm
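# Reference (C-like pseudocode, added for clarity): with y0/y1/y2/y3 as
# named in the comments, each round above is the standard SHA-256 step
#   S1  = ror32(e,6) ^ ror32(e,11) ^ ror32(e,25)             # y0
#   CH  = ((f ^ g) & e) ^ g        ( == (e&f) ^ (~e&g) )     # y2
#   S0  = ror32(a,2) ^ ror32(a,13) ^ ror32(a,22)             # y1
#   MAJ = ((a | c) & b) | (a & c)  ( == (a&b)^(a&c)^(b&c) )  # y3
#   T1  = h + S1 + CH + K[t] + W[t]
#   d  += T1
#   h   = T1 + S0 + MAJ   (becomes "a" after ROTATE_ARGS)
# Note that rorx rotates; the ">> n" notation in the comments above means
# a rotate by n bits, not a logical shift.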
########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
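# A matching C declaration (the naming here is an assumption; the exact
# prototype lives in the C glue code) would look like:
#   asmlinkage void sha256_transform_rorx(u32 *digest, const u8 *data,
#                                         u64 num_blks);
# Under the x86_64 SysV ABI this puts the digest in %rdi (CTX), the data
# pointer in %rsi (INP) and the block count in %rdx (NUM_BLKS), matching
# the register definitions above.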
.text
ENTRY(sha256_transform_rorx)
.align 32
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
mov %rsp, %rax
subq $STACK_SIZE, %rsp
and $-32, %rsp # align rsp to 32 byte boundary
mov %rax, _RSP(%rsp)
shl $6, NUM_BLKS # convert to bytes
jz done_hash
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
mov NUM_BLKS, _INP_END(%rsp)
cmp NUM_BLKS, INP
je only_one_block
## load initial digest
mov (CTX), a
mov 4*1(CTX), b
mov 4*2(CTX), c
mov 4*3(CTX), d
mov 4*4(CTX), e
mov 4*5(CTX), f
mov 4*6(CTX), g
mov 4*7(CTX), h
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
mov CTX, _CTX(%rsp)
loop0:
lea K256(%rip), TBL
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
VMOVDQ 2*32(INP),XTMP2
VMOVDQ 3*32(INP),XTMP3
## byte swap data
vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
## transpose data into high/low halves
vperm2i128 $0x20, XTMP2, XTMP0, X0
vperm2i128 $0x31, XTMP2, XTMP0, X1
vperm2i128 $0x20, XTMP3, XTMP1, X2
vperm2i128 $0x31, XTMP3, XTMP1, X3
last_block_enter:
add $64, INP
mov INP, _INP(%rsp)
## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
xor SRND, SRND
.align 16
loop1:
vpaddd 0*32(TBL, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
vpaddd 1*32(TBL, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
vpaddd 2*32(TBL, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
vpaddd 3*32(TBL, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
add $4*32, SRND
cmp $3*4*32, SRND
jb loop1
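# Note (added for clarity): each 32-byte XFER slot holds K+W for 4 rounds
# of both interleaved blocks, so one pass of loop1 covers 16 rounds and
# advances SRND by 4*32.  The 3*4*32 bound therefore corresponds to the
# 48 scheduled rounds; the final 16 rounds follow in loop2.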
loop2:
## Do last 16 rounds with no scheduling
vpaddd 0*32(TBL, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
vpaddd 1*32(TBL, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
vmovdqa X2, X0
vmovdqa X3, X1
cmp $4*4*32, SRND
jb loop2
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
addm (4*0)(CTX),a
addm (4*1)(CTX),b
addm (4*2)(CTX),c
addm (4*3)(CTX),d
addm (4*4)(CTX),e
addm (4*5)(CTX),f
addm (4*6)(CTX),g
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
ja done_hash
#### Do second block using previously scheduled results
xor SRND, SRND
.align 16
loop3:
DO_4ROUNDS _XFER + 0*32 + 16
DO_4ROUNDS _XFER + 1*32 + 16
add $2*32, SRND
cmp $4*4*32, SRND
jb loop3
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
add $64, INP
addm (4*0)(CTX),a
addm (4*1)(CTX),b
addm (4*2)(CTX),c
addm (4*3)(CTX),d
addm (4*4)(CTX),e
addm (4*5)(CTX),f
addm (4*6)(CTX),g
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
jb loop0
ja done_hash
do_last_block:
#### do last block
lea K256(%rip), TBL
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
VMOVDQ 3*16(INP),XWORD3
vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
jmp last_block_enter
only_one_block:
## load initial digest
mov (4*0)(CTX),a
mov (4*1)(CTX),b
mov (4*2)(CTX),c
mov (4*3)(CTX),d
mov (4*4)(CTX),e
mov (4*5)(CTX),f
mov (4*6)(CTX),g
mov (4*7)(CTX),h
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
mov CTX, _CTX(%rsp)
jmp do_last_block
done_hash:
mov _RSP(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
ret
ENDPROC(sha256_transform_rorx)
.data
.align 64
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
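# Note (added for clarity): each row of round constants is stored twice,
# so a single 32-byte vpaddd of (TBL, SRND) adds the same four K values
# to both 128-bit lanes, i.e. to the schedule words of both interleaved
# blocks.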
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
# shuffle xBxA -> 00BA
_SHUF_00BA:
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
# shuffle xDxC -> DC00
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
#endif