poly1305-sse2-x86_64.S

/*
 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 16

ANMASK: .octa 0x0000000003ffffff0000000003ffffff
ORMASK: .octa 0x00000000010000000000000001000000
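
/*
 * ANMASK keeps the low 26 bits of each unpacked dword lane, i.e. one base
 * 2^26 limb. ORMASK sets bit 24 of a lane, which is where the 2^128 pad bit
 * of a full 16 byte message block lands once the top limb has been shifted
 * right by 8.
 */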

.text

#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)
#define m %rsi
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12

ENTRY(poly1305_block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Block count

        # This single block variant tries to improve performance by doing two
        # multiplications in parallel using SSE instructions. There is quite
        # some quadword packing involved, hence the speedup is marginal.
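
        # Note: h[5] and r[5] are five 26-bit limbs (radix 2^26), so a 64-bit
        # lane comfortably holds a limb product plus the accumulated carries.
        # The two parallel multiplications run in the low dwords of the two
        # 64-bit lanes of each xmm register unpacked below.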

        push %rbx
        push %r12
        sub $0x10,%rsp

        # s1..s4 = r1..r4 * 5
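        # (2^130 == 5 mod 2^130 - 5, so a product term landing in limb
        # position k >= 5 folds back into position k - 5 with a factor of 5;
        # pre-scaling r1..r4 lets the multiplication below do that folding
        # for free.)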
        mov r1,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s1
        mov r2,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s2
        mov r3,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s3
        mov r4,%eax
        lea (%eax,%eax,4),%eax
        mov %eax,s4
        movdqa ANMASK(%rip),mask

.Ldoblock:
        # h01 = [0, h1, 0, h0]
        # h23 = [0, h3, 0, h2]
        # h44 = [0, h4, 0, h4]
        movd h0,h01
        movd h1,t1
        movd h2,h23
        movd h3,t2
        movd h4,h44
        punpcklqdq t1,h01
        punpcklqdq t2,h23
        punpcklqdq h44,h44

        # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
        movd 0x00(m),t1
        movd 0x03(m),t2
        psrld $2,t2
        punpcklqdq t2,t1
        pand mask,t1
        paddd t1,h01

        # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
        movd 0x06(m),t1
        movd 0x09(m),t2
        psrld $4,t1
        psrld $6,t2
        punpcklqdq t2,t1
        pand mask,t1
        paddd t1,h23

        # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
        mov 0x0c(m),%eax
        shr $8,%eax
        or $0x01000000,%eax
        movd %eax,t1
        pshufd $0xc4,t1,t1
        paddd t1,h44
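
        # The 130-bit product h * r is accumulated column-wise:
        #   d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1
        #   d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2
        #   d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3
        #   d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4
        #   d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
        # Each pmuludq below computes two of these partial products at once.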

        # t1[0] = h0 * r0 + h2 * s3
        # t1[1] = h1 * s4 + h3 * s2
        movd r0,t1
        movd s4,t2
        punpcklqdq t2,t1
        pmuludq h01,t1
        movd s3,t2
        movd s2,t3
        punpcklqdq t3,t2
        pmuludq h23,t2
        paddq t2,t1

        # t2[0] = h0 * r1 + h2 * s4
        # t2[1] = h1 * r0 + h3 * s3
        movd r1,t2
        movd r0,t3
        punpcklqdq t3,t2
        pmuludq h01,t2
        movd s4,t3
        movd s3,t4
        punpcklqdq t4,t3
        pmuludq h23,t3
        paddq t3,t2

        # t3[0] = h4 * s1
        # t3[1] = h4 * s2
        movd s1,t3
        movd s2,t4
        punpcklqdq t4,t3
        pmuludq h44,t3

        # d0 = t1[0] + t1[1] + t3[0]
        # d1 = t2[0] + t2[1] + t3[1]
        movdqa t1,t4
        punpcklqdq t2,t4
        punpckhqdq t2,t1
        paddq t4,t1
        paddq t3,t1
        movq t1,d0
        psrldq $8,t1
        movq t1,d1

        # t1[0] = h0 * r2 + h2 * r0
        # t1[1] = h1 * r1 + h3 * s4
        movd r2,t1
        movd r1,t2
        punpcklqdq t2,t1
        pmuludq h01,t1
        movd r0,t2
        movd s4,t3
        punpcklqdq t3,t2
        pmuludq h23,t2
        paddq t2,t1

        # t2[0] = h0 * r3 + h2 * r1
        # t2[1] = h1 * r2 + h3 * r0
        movd r3,t2
        movd r2,t3
        punpcklqdq t3,t2
        pmuludq h01,t2
        movd r1,t3
        movd r0,t4
        punpcklqdq t4,t3
        pmuludq h23,t3
        paddq t3,t2

        # t3[0] = h4 * s3
        # t3[1] = h4 * s4
        movd s3,t3
        movd s4,t4
        punpcklqdq t4,t3
        pmuludq h44,t3

        # d2 = t1[0] + t1[1] + t3[0]
        # d3 = t2[0] + t2[1] + t3[1]
        movdqa t1,t4
        punpcklqdq t2,t4
        punpckhqdq t2,t1
        paddq t4,t1
        paddq t3,t1
        movq t1,d2
        psrldq $8,t1
        movq t1,d3

        # t1[0] = h0 * r4 + h2 * r2
        # t1[1] = h1 * r3 + h3 * r1
        movd r4,t1
        movd r3,t2
        punpcklqdq t2,t1
        pmuludq h01,t1
        movd r2,t2
        movd r1,t3
        punpcklqdq t3,t2
        pmuludq h23,t2
        paddq t2,t1

        # t3[0] = h4 * r0
        movd r0,t3
        pmuludq h44,t3

        # d4 = t1[0] + t1[1] + t3[0]
        movdqa t1,t4
        psrldq $8,t4
        paddq t4,t1
        paddq t3,t1
        movq t1,d4
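
        # Partial reduction: propagate the carries d0 -> d1 -> d2 -> d3 -> d4
        # and fold the bits above 2^130 back into h0 with a factor of 5
        # (2^130 == 5 mod 2^130 - 5). h is left only partially reduced; the
        # final reduction mod 2^130 - 5 is expected to happen when the tag is
        # emitted.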
        # d1 += d0 >> 26
        mov d0,%rax
        shr $26,%rax
        add %rax,d1
        # h0 = d0 & 0x3ffffff
        mov d0,%rbx
        and $0x3ffffff,%ebx
        # d2 += d1 >> 26
        mov d1,%rax
        shr $26,%rax
        add %rax,d2
        # h1 = d1 & 0x3ffffff
        mov d1,%rax
        and $0x3ffffff,%eax
        mov %eax,h1
        # d3 += d2 >> 26
        mov d2,%rax
        shr $26,%rax
        add %rax,d3
        # h2 = d2 & 0x3ffffff
        mov d2,%rax
        and $0x3ffffff,%eax
        mov %eax,h2
        # d4 += d3 >> 26
        mov d3,%rax
        shr $26,%rax
        add %rax,d4
        # h3 = d3 & 0x3ffffff
        mov d3,%rax
        and $0x3ffffff,%eax
        mov %eax,h3
        # h0 += (d4 >> 26) * 5
        mov d4,%rax
        shr $26,%rax
        lea (%rax,%rax,4),%rax
        add %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov d4,%rax
        and $0x3ffffff,%eax
        mov %eax,h4
        # h1 += h0 >> 26
        mov %rbx,%rax
        shr $26,%rax
        add %eax,h1
        # h0 = h0 & 0x3ffffff
        andl $0x3ffffff,%ebx
        mov %ebx,h0

        add $0x10,m
        dec %rcx
        jnz .Ldoblock

        add $0x10,%rsp
        pop %r12
        pop %rbx
        ret
ENDPROC(poly1305_block_sse2)

#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define hc0 %xmm0
#define hc1 %xmm1
#define hc2 %xmm2
#define hc3 %xmm5
#define hc4 %xmm6
#define ru0 %xmm7
#define ru1 %xmm8
#define ru2 %xmm9
#define ru3 %xmm10
#define ru4 %xmm11
#define sv1 %xmm12
#define sv2 %xmm13
#define sv3 %xmm14
#define sv4 %xmm15
#undef d0
#define d0 %r13

ENTRY(poly1305_2block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Doubleblock count
        # %r8: Poly1305 derived key r^2 u[5]

        # This two-block variant further improves performance by using
        # loop-unrolled block processing. It is more straightforward and does
        # less byte shuffling, but requires a second Poly1305 key r^2:
        # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
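        # Note: this is just two Horner steps merged,
        #   h' = ((h + m1) * r + m2) * r = (h + m1) * r^2 + m2 * r
        # Lane 0 of each vector below carries h + m1 and is multiplied by
        # u = r^2, lane 1 carries m2 and is multiplied by r; the two lanes of
        # every column are summed when d0..d4 are extracted.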

        push %rbx
        push %r12
        push %r13

        # combine r0,u0
        movd u0,ru0
        movd r0,t1
        punpcklqdq t1,ru0

        # combine r1,u1 and s1=r1*5,v1=u1*5
        movd u1,ru1
        movd r1,t1
        punpcklqdq t1,ru1
        movdqa ru1,sv1
        pslld $2,sv1
        paddd ru1,sv1

        # combine r2,u2 and s2=r2*5,v2=u2*5
        movd u2,ru2
        movd r2,t1
        punpcklqdq t1,ru2
        movdqa ru2,sv2
        pslld $2,sv2
        paddd ru2,sv2

        # combine r3,u3 and s3=r3*5,v3=u3*5
        movd u3,ru3
        movd r3,t1
        punpcklqdq t1,ru3
        movdqa ru3,sv3
        pslld $2,sv3
        paddd ru3,sv3

        # combine r4,u4 and s4=r4*5,v4=u4*5
        movd u4,ru4
        movd r4,t1
        punpcklqdq t1,ru4
        movdqa ru4,sv4
        pslld $2,sv4
        paddd ru4,sv4
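
        # As in the single block code, sv1..sv4 = 5*[u1..u4, r1..r4] absorb
        # the mod 2^130 - 5 wrap-around of the upper product terms; the *5 is
        # computed above as x*4 + x via pslld/paddd.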

.Ldoblock2:
        # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
        movd 0x00(m),hc0
        movd 0x10(m),t1
        punpcklqdq t1,hc0
        pand ANMASK(%rip),hc0
        movd h0,t1
        paddd t1,hc0

        # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
        movd 0x03(m),hc1
        movd 0x13(m),t1
        punpcklqdq t1,hc1
        psrld $2,hc1
        pand ANMASK(%rip),hc1
        movd h1,t1
        paddd t1,hc1

        # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
        movd 0x06(m),hc2
        movd 0x16(m),t1
        punpcklqdq t1,hc2
        psrld $4,hc2
        pand ANMASK(%rip),hc2
        movd h2,t1
        paddd t1,hc2

        # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
        movd 0x09(m),hc3
        movd 0x19(m),t1
        punpcklqdq t1,hc3
        psrld $6,hc3
        pand ANMASK(%rip),hc3
        movd h3,t1
        paddd t1,hc3

        # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
        movd 0x0c(m),hc4
        movd 0x1c(m),t1
        punpcklqdq t1,hc4
        psrld $8,hc4
        por ORMASK(%rip),hc4
        movd h4,t1
        paddd t1,hc4
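
        # The column sums d0..d4 are formed just as in the single block
        # version, except that each pmuludq now produces the u-lane and the
        # r-lane partial product in one instruction; the two 64-bit halves of
        # every column are added together when d0..d4 are extracted below.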

        # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
        movdqa ru0,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
        movdqa sv3,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
        movdqa sv2,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
        movdqa sv1,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d0 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d0

        # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
        movdqa ru1,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
        movdqa sv3,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
        movdqa sv2,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d1 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d1

        # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
        movdqa ru2,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
        movdqa ru1,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
        movdqa sv3,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d2 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d2

        # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
        movdqa ru3,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
        movdqa ru2,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
        movdqa ru1,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
        movdqa sv4,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d3 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d3

        # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
        movdqa ru4,t1
        pmuludq hc0,t1
        # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
        movdqa ru3,t2
        pmuludq hc1,t2
        paddq t2,t1
        # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
        movdqa ru2,t2
        pmuludq hc2,t2
        paddq t2,t1
        # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
        movdqa ru1,t2
        pmuludq hc3,t2
        paddq t2,t1
        # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
        movdqa ru0,t2
        pmuludq hc4,t2
        paddq t2,t1
        # d4 = t1[0] + t1[1]
        movdqa t1,t2
        psrldq $8,t2
        paddq t2,t1
        movq t1,d4

        # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
        # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
        # amount. Careful: we must not assume the carry bits 'd0 >> 26',
        # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
        # integers. It's true in a single-block implementation, but not here.

        # d1 += d0 >> 26
        mov d0,%rax
        shr $26,%rax
        add %rax,d1
        # h0 = d0 & 0x3ffffff
        mov d0,%rbx
        and $0x3ffffff,%ebx
        # d2 += d1 >> 26
        mov d1,%rax
        shr $26,%rax
        add %rax,d2
        # h1 = d1 & 0x3ffffff
        mov d1,%rax
        and $0x3ffffff,%eax
        mov %eax,h1
        # d3 += d2 >> 26
        mov d2,%rax
        shr $26,%rax
        add %rax,d3
        # h2 = d2 & 0x3ffffff
        mov d2,%rax
        and $0x3ffffff,%eax
        mov %eax,h2
        # d4 += d3 >> 26
        mov d3,%rax
        shr $26,%rax
        add %rax,d4
        # h3 = d3 & 0x3ffffff
        mov d3,%rax
        and $0x3ffffff,%eax
        mov %eax,h3
        # h0 += (d4 >> 26) * 5
        mov d4,%rax
        shr $26,%rax
        lea (%rax,%rax,4),%rax
        add %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov d4,%rax
        and $0x3ffffff,%eax
        mov %eax,h4
        # h1 += h0 >> 26
        mov %rbx,%rax
        shr $26,%rax
        add %eax,h1
        # h0 = h0 & 0x3ffffff
        andl $0x3ffffff,%ebx
        mov %ebx,h0

        add $0x20,m
        dec %rcx
        jnz .Ldoblock2

        pop %r13
        pop %r12
        pop %rbx
        ret
ENDPROC(poly1305_2block_sse2)