/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
.data
.align 16

# ROT8/ROT16 are pshufb masks that rotate each 32-bit word left by 8/16 bits;
# CTRINC holds the increments 0..3 added to the four per-block counters.
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000

.text
ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state matrix
	# into four SSE registers. It performs matrix operations on four words
	# in parallel, but requires shuffling to rearrange the words after each
	# round. 8/16-bit word rotation is done with the slightly better
	# performing SSSE3 byte shuffling; 7/12-bit word rotation uses the
	# traditional shift+OR.
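
	# In the rotl32 pseudocode notation used below, each pass of the loop
	# body computes four ChaCha20 quarter-rounds in parallel, one per
	# 32-bit lane, roughly:
	#
	#	a += b; d ^= a; d = rotl32(d, 16);
	#	c += d; b ^= c; b = rotl32(b, 12);
	#	a += b; d ^= a; d = rotl32(d,  8);
	#	c += d; b ^= c; b = rotl32(b,  7);
	#
	# where a..d are the state rows in %xmm0..%xmm3. The pshufd shuffles
	# between passes rotate rows x1..x3 so the second pass operates on the
	# diagonals instead of the columns.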

	# x0..3 = s0..3
	movdqa	0x00(%rdi),%xmm0
	movdqa	0x10(%rdi),%xmm1
	movdqa	0x20(%rdi),%xmm2
	movdqa	0x30(%rdi),%xmm3
	movdqa	%xmm0,%xmm8
	movdqa	%xmm1,%xmm9
	movdqa	%xmm2,%xmm10
	movdqa	%xmm3,%xmm11

	movdqa	ROT8(%rip),%xmm4
	movdqa	ROT16(%rip),%xmm5

	mov	$10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm3,%xmm3

	dec	%ecx
	jnz	.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu	0x00(%rdx),%xmm4
	paddd	%xmm8,%xmm0
	pxor	%xmm4,%xmm0
	movdqu	%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu	0x10(%rdx),%xmm5
	paddd	%xmm9,%xmm1
	pxor	%xmm5,%xmm1
	movdqu	%xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu	0x20(%rdx),%xmm6
	paddd	%xmm10,%xmm2
	pxor	%xmm6,%xmm2
	movdqu	%xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu	0x30(%rdx),%xmm7
	paddd	%xmm11,%xmm3
	pxor	%xmm7,%xmm3
	movdqu	%xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32- and then 64-bit
	# words, which allows us to do the XOR in SSE registers. 8/16-bit word
	# rotation is done with the slightly better performing SSSE3 byte
	# shuffling; 7/12-bit word rotation uses the traditional shift+OR.
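
	# Data layout sketch: state word n of all four blocks lives in one XMM
	# register (or, for x0..x3, in a 16-byte stack slot), one block per
	# 32-bit lane. All four blocks thus run the round function in lockstep,
	# and the keystream only needs to be regrouped per block for the final
	# XOR against the input.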

	# Save %rsp and carve out a 64-byte-aligned scratch area for x0..x3.
	mov	%rsp,%r11
	sub	$0x80,%rsp
	and	$~63,%rsp

	# x0..15[0-3] = s0..3[0..3]
	movq	0x00(%rdi),%xmm1
	pshufd	$0x00,%xmm1,%xmm0
	pshufd	$0x55,%xmm1,%xmm1
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	movq	0x10(%rdi),%xmm5
	pshufd	$0x00,%xmm5,%xmm4
	pshufd	$0x55,%xmm5,%xmm5
	movq	0x18(%rdi),%xmm7
	pshufd	$0x00,%xmm7,%xmm6
	pshufd	$0x55,%xmm7,%xmm7
	movq	0x20(%rdi),%xmm9
	pshufd	$0x00,%xmm9,%xmm8
	pshufd	$0x55,%xmm9,%xmm9
	movq	0x28(%rdi),%xmm11
	pshufd	$0x00,%xmm11,%xmm10
	pshufd	$0x55,%xmm11,%xmm11
	movq	0x30(%rdi),%xmm13
	pshufd	$0x00,%xmm13,%xmm12
	pshufd	$0x55,%xmm13,%xmm13
	movq	0x38(%rdi),%xmm15
	pshufd	$0x00,%xmm15,%xmm14
	pshufd	$0x55,%xmm15,%xmm15

	# x0..3 on stack
	movdqa	%xmm0,0x00(%rsp)
	movdqa	%xmm1,0x10(%rsp)
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm3,0x30(%rsp)

	movdqa	CTRINC(%rip),%xmm1
	movdqa	ROT8(%rip),%xmm2
	movdqa	ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12

	mov	$10,%ecx

.Ldoubleround4:

	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12

	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13

	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14

	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4

	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5

	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6

	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12

	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13

	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14

	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4

	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5

	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6

	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15

	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12

	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13

	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5

	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6

	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7

	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15

	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12

	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13

	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5

	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6

	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7

	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4

	dec	%ecx
	jnz	.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq	0x00(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x00(%rsp),%xmm2
	movdqa	%xmm2,0x00(%rsp)
	paddd	0x10(%rsp),%xmm3
	movdqa	%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x20(%rsp),%xmm2
	movdqa	%xmm2,0x20(%rsp)
	paddd	0x30(%rsp),%xmm3
	movdqa	%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq	0x10(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq	0x18(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm6
	paddd	%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq	0x20(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm8
	paddd	%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq	0x28(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm10
	paddd	%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq	0x30(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm12
	paddd	%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq	0x38(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm14
	paddd	%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12
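
	# At this point register/slot n still holds state word n of all four
	# blocks, one block per lane. The two interleave passes below transpose
	# that layout so that each XMM value ends up holding four consecutive
	# words of a single block (e.g. x0..x3 of block 0), i.e. 16 contiguous
	# keystream bytes that can be XORed directly against the input at the
	# matching block offset.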

	# interleave 32-bit words in state n, n+1
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x10(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x10(%rsp)
	movdqa	0x20(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa	%xmm0,%xmm9
	movdqa	%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa	%xmm0,%xmm13
	movdqa	%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x20(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x20(%rsp)
	movdqa	0x10(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x10(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa	%xmm0,%xmm6
	movdqa	%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa	%xmm0,%xmm10
	movdqa	%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa	%xmm0,%xmm14
	movdqa	%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# xor with corresponding input, write to output
	movdqa	0x00(%rsp),%xmm0
	movdqu	0x00(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x00(%rsi)
	movdqa	0x10(%rsp),%xmm0
	movdqu	0x80(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x80(%rsi)
	movdqa	0x20(%rsp),%xmm0
	movdqu	0x40(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x40(%rsi)
	movdqa	0x30(%rsp),%xmm0
	movdqu	0xc0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xc0(%rsi)
	movdqu	0x10(%rdx),%xmm1
	pxor	%xmm1,%xmm4
	movdqu	%xmm4,0x10(%rsi)
	movdqu	0x90(%rdx),%xmm1
	pxor	%xmm1,%xmm5
	movdqu	%xmm5,0x90(%rsi)
	movdqu	0x50(%rdx),%xmm1
	pxor	%xmm1,%xmm6
	movdqu	%xmm6,0x50(%rsi)
	movdqu	0xd0(%rdx),%xmm1
	pxor	%xmm1,%xmm7
	movdqu	%xmm7,0xd0(%rsi)
	movdqu	0x20(%rdx),%xmm1
	pxor	%xmm1,%xmm8
	movdqu	%xmm8,0x20(%rsi)
	movdqu	0xa0(%rdx),%xmm1
	pxor	%xmm1,%xmm9
	movdqu	%xmm9,0xa0(%rsi)
	movdqu	0x60(%rdx),%xmm1
	pxor	%xmm1,%xmm10
	movdqu	%xmm10,0x60(%rsi)
	movdqu	0xe0(%rdx),%xmm1
	pxor	%xmm1,%xmm11
	movdqu	%xmm11,0xe0(%rsi)
	movdqu	0x30(%rdx),%xmm1
	pxor	%xmm1,%xmm12
	movdqu	%xmm12,0x30(%rsi)
	movdqu	0xb0(%rdx),%xmm1
	pxor	%xmm1,%xmm13
	movdqu	%xmm13,0xb0(%rsi)
	movdqu	0x70(%rdx),%xmm1
	pxor	%xmm1,%xmm14
	movdqu	%xmm14,0x70(%rsi)
	movdqu	0xf0(%rdx),%xmm1
	pxor	%xmm1,%xmm15
	movdqu	%xmm15,0xf0(%rsi)

	mov	%r11,%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)