salsa20-x86_64-asm_64.S

#include <linux/linkage.h>
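#
# Salsa20 stream cipher, x86_64 assembly.  Three entry points are exported:
# salsa20_keysetup, salsa20_ivsetup and salsa20_encrypt_bytes.  Arguments
# arrive in %rdi, %rsi, %rdx and %rcx (System V AMD64 calling convention).
# The "# a = b" comments describe the data flow of the instruction that
# follows them; some of those assignments compile to self-moves, which are
# harmless no-ops.
#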
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
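# %r11 now holds (original %rsp & 31) + 256; subtracting it leaves %rsp
# 32-byte aligned with at least 256 bytes of scratch space.  The same value
# is saved below and added back in the epilogue to restore the caller's
# stack pointer.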
# x = arg1
mov %rdi,%r8
# m = arg2
mov %rsi,%rsi
# out = arg3
mov %rdx,%rdi
# bytes = arg4
mov %rcx,%rdx
# unsigned>? bytes - 0
cmp $0,%rdx
# comment:fp stack unchanged by jump
# goto done if !unsigned>
jbe ._done
# comment:fp stack unchanged by fallthrough
# start:
._start:
# r11_stack = r11
movq %r11,0(%rsp)
# r12_stack = r12
movq %r12,8(%rsp)
# r13_stack = r13
movq %r13,16(%rsp)
# r14_stack = r14
movq %r14,24(%rsp)
# r15_stack = r15
movq %r15,32(%rsp)
# rbx_stack = rbx
movq %rbx,40(%rsp)
# rbp_stack = rbp
movq %rbp,48(%rsp)
# in0 = *(uint64 *) (x + 0)
movq 0(%r8),%rcx
# in2 = *(uint64 *) (x + 8)
movq 8(%r8),%r9
# in4 = *(uint64 *) (x + 16)
movq 16(%r8),%rax
# in6 = *(uint64 *) (x + 24)
movq 24(%r8),%r10
# in8 = *(uint64 *) (x + 32)
movq 32(%r8),%r11
# in10 = *(uint64 *) (x + 40)
movq 40(%r8),%r12
# in12 = *(uint64 *) (x + 48)
movq 48(%r8),%r13
# in14 = *(uint64 *) (x + 56)
movq 56(%r8),%r14
# j0 = in0
movq %rcx,56(%rsp)
# j2 = in2
movq %r9,64(%rsp)
# j4 = in4
movq %rax,72(%rsp)
# j6 = in6
movq %r10,80(%rsp)
# j8 = in8
movq %r11,88(%rsp)
# j10 = in10
movq %r12,96(%rsp)
# j12 = in12
movq %r13,104(%rsp)
# j14 = in14
movq %r14,112(%rsp)
# x_backup = x
movq %r8,120(%rsp)
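# Scratch frame layout (relative to the aligned %rsp):
#   0..48     saved callee-clobbered registers (r11-r15, rbx, rbp)
#   56..112   j0..j14: copy of the 16-word input state, kept as eight 64-bit pairs
#   120       x_backup, 128 ctarget, 136 out_backup, 144 m_backup, 152 bytes_backup
#   160/168/176  spill slots for x5, x10, x15 during the rounds
#   184       round-counter backup
#   192       64-byte bounce buffer for partial blocks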
# bytesatleast1:
._bytesatleast1:
# unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto nocopy if !unsigned<
jae ._nocopy
# ctarget = out
movq %rdi,128(%rsp)
# out = &tmp
leaq 192(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# out = &tmp
leaq 192(%rsp),%rdi
# m = &tmp
leaq 192(%rsp),%rsi
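# Fewer than 64 bytes remain: the real destination is parked in ctarget,
# the partial input is copied into the 64-byte tmp buffer, and both m and
# out are pointed at tmp so the block code below can run unmodified.  The
# valid prefix is copied back out once the block has been produced.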
# comment:fp stack unchanged by fallthrough
# nocopy:
._nocopy:
# out_backup = out
movq %rdi,136(%rsp)
# m_backup = m
movq %rsi,144(%rsp)
# bytes_backup = bytes
movq %rdx,152(%rsp)
# x1 = j0
movq 56(%rsp),%rdi
# x0 = x1
mov %rdi,%rdx
# (uint64) x1 >>= 32
shr $32,%rdi
# x3 = j2
movq 64(%rsp),%rsi
# x2 = x3
mov %rsi,%rcx
# (uint64) x3 >>= 32
shr $32,%rsi
# x5 = j4
movq 72(%rsp),%r8
# x4 = x5
mov %r8,%r9
# (uint64) x5 >>= 32
shr $32,%r8
# x5_stack = x5
movq %r8,160(%rsp)
# x7 = j6
movq 80(%rsp),%r8
# x6 = x7
mov %r8,%rax
# (uint64) x7 >>= 32
shr $32,%r8
# x9 = j8
movq 88(%rsp),%r10
# x8 = x9
mov %r10,%r11
# (uint64) x9 >>= 32
shr $32,%r10
# x11 = j10
movq 96(%rsp),%r12
# x10 = x11
mov %r12,%r13
# x10_stack = x10
movq %r13,168(%rsp)
# (uint64) x11 >>= 32
shr $32,%r12
# x13 = j12
movq 104(%rsp),%r13
# x12 = x13
mov %r13,%r14
# (uint64) x13 >>= 32
shr $32,%r13
# x15 = j14
movq 112(%rsp),%r15
# x14 = x15
mov %r15,%rbx
# (uint64) x15 >>= 32
shr $32,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = 20
mov $20,%r15
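# The 16 state words x0..x15 now live in registers, except that x5, x10 and
# x15 rotate through the stack slots above because %rbp and %r15 are needed
# as temporaries.  Salsa20/20 performs 20 rounds; each pass through
# ._mainloop below computes four of them (two column/row double rounds), so
# the counter starts at 20 and drops by 4 per iteration.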
# mainloop:
._mainloop:
# i_backup = i
movq %r15,184(%rsp)
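# First double round, column half: quarter-rounds down the four columns
# (x0,x4,x8,x12), (x5,x9,x13,x1), (x10,x14,x2,x6) and (x15,x3,x7,x11);
# the first two columns are interleaved for scheduling, the other two follow.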
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
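# Row half of the first double round: quarter-rounds across the four rows
# (x0,x1,x2,x3), (x5,x6,x7,x4), (x10,x11,x8,x9) and (x15,x12,x13,x14).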
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
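# Second double round: the column half and row half above are repeated
# verbatim, completing the four rounds of this loop iteration.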
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = i_backup
movq 184(%rsp),%r15
# unsigned>? i -= 4
sub $4,%r15
# comment:fp stack unchanged by jump
# goto mainloop if unsigned>
ja ._mainloop
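# Feedforward and repacking: add the saved input word j_i back into every
# output word x_i.  The even-indexed word of each pair is added with a
# 32-bit addl (which also clears the upper half of its register); the
# odd-indexed word is shifted into the upper half, has the 64-bit j pair
# added so that only its upper half matters, is re-isolated with shr/shl,
# and is merged back in.  Each register then holds a finished little-endian
# 64-bit word pair, ready to be XORed with the message.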
# (uint32) x2 += j2
addl 64(%rsp),%ecx
# x3 <<= 32
shl $32,%rsi
# x3 += j2
addq 64(%rsp),%rsi
# (uint64) x3 >>= 32
shr $32,%rsi
# x3 <<= 32
shl $32,%rsi
# x2 += x3
add %rsi,%rcx
# (uint32) x6 += j6
addl 80(%rsp),%eax
# x7 <<= 32
shl $32,%r8
# x7 += j6
addq 80(%rsp),%r8
# (uint64) x7 >>= 32
shr $32,%r8
# x7 <<= 32
shl $32,%r8
# x6 += x7
add %r8,%rax
# (uint32) x8 += j8
addl 88(%rsp),%r11d
# x9 <<= 32
shl $32,%r10
# x9 += j8
addq 88(%rsp),%r10
# (uint64) x9 >>= 32
shr $32,%r10
# x9 <<= 32
shl $32,%r10
# x8 += x9
add %r10,%r11
# (uint32) x12 += j12
addl 104(%rsp),%r14d
# x13 <<= 32
shl $32,%r13
# x13 += j12
addq 104(%rsp),%r13
# (uint64) x13 >>= 32
shr $32,%r13
# x13 <<= 32
shl $32,%r13
# x12 += x13
add %r13,%r14
# (uint32) x0 += j0
addl 56(%rsp),%edx
# x1 <<= 32
shl $32,%rdi
# x1 += j0
addq 56(%rsp),%rdi
# (uint64) x1 >>= 32
shr $32,%rdi
# x1 <<= 32
shl $32,%rdi
# x0 += x1
add %rdi,%rdx
# x5 = x5_stack
movq 160(%rsp),%rdi
# (uint32) x4 += j4
addl 72(%rsp),%r9d
# x5 <<= 32
shl $32,%rdi
# x5 += j4
addq 72(%rsp),%rdi
# (uint64) x5 >>= 32
shr $32,%rdi
# x5 <<= 32
shl $32,%rdi
# x4 += x5
add %rdi,%r9
# x10 = x10_stack
movq 168(%rsp),%r8
# (uint32) x10 += j10
addl 96(%rsp),%r8d
# x11 <<= 32
shl $32,%r12
# x11 += j10
addq 96(%rsp),%r12
# (uint64) x11 >>= 32
shr $32,%r12
# x11 <<= 32
shl $32,%r12
# x10 += x11
add %r12,%r8
# x15 = x15_stack
movq 176(%rsp),%rdi
# (uint32) x14 += j14
addl 112(%rsp),%ebx
# x15 <<= 32
shl $32,%rdi
# x15 += j14
addq 112(%rsp),%rdi
# (uint64) x15 >>= 32
shr $32,%rdi
# x15 <<= 32
shl $32,%rdi
# x14 += x15
add %rdi,%rbx
# out = out_backup
movq 136(%rsp),%rdi
# m = m_backup
movq 144(%rsp),%rsi
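# XOR the 64-byte keystream block into the message, 8 bytes at a time, and
# store the result to out.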
# x0 ^= *(uint64 *) (m + 0)
xorq 0(%rsi),%rdx
# *(uint64 *) (out + 0) = x0
movq %rdx,0(%rdi)
# x2 ^= *(uint64 *) (m + 8)
xorq 8(%rsi),%rcx
# *(uint64 *) (out + 8) = x2
movq %rcx,8(%rdi)
# x4 ^= *(uint64 *) (m + 16)
xorq 16(%rsi),%r9
# *(uint64 *) (out + 16) = x4
movq %r9,16(%rdi)
# x6 ^= *(uint64 *) (m + 24)
xorq 24(%rsi),%rax
# *(uint64 *) (out + 24) = x6
movq %rax,24(%rdi)
# x8 ^= *(uint64 *) (m + 32)
xorq 32(%rsi),%r11
# *(uint64 *) (out + 32) = x8
movq %r11,32(%rdi)
# x10 ^= *(uint64 *) (m + 40)
xorq 40(%rsi),%r8
# *(uint64 *) (out + 40) = x10
movq %r8,40(%rdi)
# x12 ^= *(uint64 *) (m + 48)
xorq 48(%rsi),%r14
# *(uint64 *) (out + 48) = x12
movq %r14,48(%rdi)
# x14 ^= *(uint64 *) (m + 56)
xorq 56(%rsi),%rbx
# *(uint64 *) (out + 56) = x14
movq %rbx,56(%rdi)
# bytes = bytes_backup
movq 152(%rsp),%rdx
# in8 = j8
movq 88(%rsp),%rcx
# in8 += 1
add $1,%rcx
# j8 = in8
movq %rcx,88(%rsp)
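# j8 (state words 8 and 9) is the 64-bit block counter; the single 64-bit
# increment above advances it for the next block.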
# unsigned>? unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto bytesatleast65 if unsigned>
ja ._bytesatleast65
# comment:fp stack unchanged by jump
# goto bytesatleast64 if !unsigned<
jae ._bytesatleast64
# m = out
mov %rdi,%rsi
# out = ctarget
movq 128(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
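# Partial final block: the keystream block was written into tmp, so copy
# only the requested number of bytes back to the real destination (ctarget).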
# comment:fp stack unchanged by fallthrough
# bytesatleast64:
._bytesatleast64:
# x = x_backup
movq 120(%rsp),%rdi
# in8 = j8
movq 88(%rsp),%rsi
# *(uint64 *) (x + 32) = in8
movq %rsi,32(%rdi)
# r11 = r11_stack
movq 0(%rsp),%r11
# r12 = r12_stack
movq 8(%rsp),%r12
# r13 = r13_stack
movq 16(%rsp),%r13
# r14 = r14_stack
movq 24(%rsp),%r14
# r15 = r15_stack
movq 32(%rsp),%r15
# rbx = rbx_stack
movq 40(%rsp),%rbx
# rbp = rbp_stack
movq 48(%rsp),%rbp
# comment:fp stack unchanged by fallthrough
# done:
._done:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
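# More than 64 bytes remain: advance bytes, out and m past the block just
# written and loop back for the next one.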
# bytesatleast65:
._bytesatleast65:
# bytes -= 64
sub $64,%rdx
# out += 64
add $64,%rdi
# m += 64
add $64,%rsi
# comment:fp stack unchanged by jump
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)
# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
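# Key setup: the key is copied into state words 1..4 (bytes 4..19) and
# words 11..14 (bytes 44..59); the four diagonal words 0, 5, 10 and 15
# (bytes 0, 20, 40, 60) receive the Salsa20 constants.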
# k = arg2
mov %rsi,%rsi
# kbits = arg3
mov %rdx,%rdx
# x = arg1
mov %rdi,%rdi
# in0 = *(uint64 *) (k + 0)
movq 0(%rsi),%r8
# in2 = *(uint64 *) (k + 8)
movq 8(%rsi),%r9
# *(uint64 *) (x + 4) = in0
movq %r8,4(%rdi)
# *(uint64 *) (x + 12) = in2
movq %r9,12(%rdi)
# unsigned<? kbits - 256
cmp $256,%rdx
# comment:fp stack unchanged by jump
# goto kbits128 if unsigned<
jb ._kbits128
# kbits256:
._kbits256:
# in10 = *(uint64 *) (k + 16)
movq 16(%rsi),%rdx
# in12 = *(uint64 *) (k + 24)
movq 24(%rsi),%rsi
# *(uint64 *) (x + 44) = in10
movq %rdx,44(%rdi)
# *(uint64 *) (x + 52) = in12
movq %rsi,52(%rdi)
# in0 = 1634760805
mov $1634760805,%rsi
# in4 = 857760878
mov $857760878,%rdx
# in10 = 2036477234
mov $2036477234,%rcx
# in14 = 1797285236
mov $1797285236,%r8
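# The four constants above are the little-endian ASCII words of
# "expand 32-byte k" (sigma), used when a 256-bit key is supplied.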
# *(uint32 *) (x + 0) = in0
movl %esi,0(%rdi)
# *(uint32 *) (x + 20) = in4
movl %edx,20(%rdi)
# *(uint32 *) (x + 40) = in10
movl %ecx,40(%rdi)
# *(uint32 *) (x + 60) = in14
movl %r8d,60(%rdi)
# comment:fp stack unchanged by jump
# goto keysetupdone
jmp ._keysetupdone
# kbits128:
._kbits128:
# in10 = *(uint64 *) (k + 0)
movq 0(%rsi),%rdx
# in12 = *(uint64 *) (k + 8)
movq 8(%rsi),%rsi
# *(uint64 *) (x + 44) = in10
movq %rdx,44(%rdi)
# *(uint64 *) (x + 52) = in12
movq %rsi,52(%rdi)
# in0 = 1634760805
mov $1634760805,%rsi
# in4 = 824206446
mov $824206446,%rdx
# in10 = 2036477238
mov $2036477238,%rcx
# in14 = 1797285236
mov $1797285236,%r8
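# For a 128-bit key the 16-byte key is stored twice, and the constants are
# the little-endian ASCII words of "expand 16-byte k" (tau).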
# *(uint32 *) (x + 0) = in0
movl %esi,0(%rdi)
# *(uint32 *) (x + 20) = in4
movl %edx,20(%rdi)
# *(uint32 *) (x + 40) = in10
movl %ecx,40(%rdi)
# *(uint32 *) (x + 60) = in14
movl %r8d,60(%rdi)
# keysetupdone:
._keysetupdone:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
ENDPROC(salsa20_keysetup)
# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
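# IV setup: store the 8-byte nonce into state words 6..7 (bytes 24..31) and
# reset the 64-bit block counter in words 8..9 (bytes 32..39) to zero.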
# iv = arg2
mov %rsi,%rsi
# x = arg1
mov %rdi,%rdi
# in6 = *(uint64 *) (iv + 0)
movq 0(%rsi),%rsi
# in8 = 0
mov $0,%r8
# *(uint64 *) (x + 24) = in6
movq %rsi,24(%rdi)
# *(uint64 *) (x + 32) = in8
movq %r8,32(%rdi)
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
ENDPROC(salsa20_ivsetup)