sha256-armv4.pl

#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
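
# The script writes GNU-assembler source to the file named by its first
# plausible "name.ext" argument; anything else is skipped by the loop
# below. A typical invocation looks roughly like (exact output name is
# up to the build system):
#
#	perl sha256-armv4.pl sha256-core.S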
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
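
# r0-r3 double as scratch registers $t0/$t4/$t1/$t3 once the ctx/inp/len
# arguments have been stashed on the stack; $Ktbl (r14) walks the K256
# constant table during the rounds.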

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
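
# Rotation/shift amounts of the SHA-256 functions (FIPS 180-4):
#	Sigma0(x) = ROR(x,2)  ^ ROR(x,13) ^ ROR(x,22)
#	Sigma1(x) = ROR(x,6)  ^ ROR(x,11) ^ ROR(x,25)
#	sigma0(x) = ROR(x,7)  ^ ROR(x,18) ^ (x>>3)
#	sigma1(x) = ROR(x,17) ^ ROR(x,19) ^ (x>>10)
# The last entry of the lower-case arrays is a logical shift, not a rotate.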

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr $t1,[$inp],#4 @ $i
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
	rev $t1,$t1
# endif
#else
	@ ldrb $t1,[$inp,#3] @ $i
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	ldrb $t2,[$inp,#2]
	ldrb $t0,[$inp,#1]
	orr $t1,$t1,$t2,lsl#8
	ldrb $t2,[$inp],#4
	orr $t1,$t1,$t0,lsl#16
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr $t1,$t1,$t2,lsl#24
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
	ldr $t2,[$Ktbl],#4 @ *K256++
	add $h,$h,$t1 @ h+=X[i]
	str $t1,[sp,#`$i%16`*4]
	eor $t1,$f,$g
	add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
	and $t1,$t1,$e
	add $h,$h,$t2 @ h+=K256[i]
	eor $t1,$t1,$g @ Ch(e,f,g)
	eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
	and $t2,$t2,#0xff
	cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4 @ prefetch
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t2,$a,$b @ a^b, b^c in next round
#else
	ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
	eor $t2,$a,$b @ a^b, b^c in next round
	ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
	eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
	and $t3,$t3,$t2 @ (b^c)&=(a^b)
	add $d,$d,$h @ d+=h
	eor $t3,$t3,$b @ Maj(a,b,c)
	add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
	@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
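
# BODY_16_XX computes one fresh word of the message schedule before
# falling through to the common round body above:
#	X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15])
# which is the standard recurrence W[t] = sigma1(W[t-2]) + W[t-7] +
# sigma0(W[t-15]) + W[t-16] over the circular 16-word buffer on the stack.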
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
	@ ldr $t4,[sp,#`($i+14)%16`*4]
	mov $t0,$t1,ror#$sigma0[0]
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	mov $t2,$t4,ror#$sigma1[0]
	eor $t0,$t0,$t1,ror#$sigma0[1]
	eor $t2,$t2,$t4,ror#$sigma1[1]
	eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
	ldr $t1,[sp,#`($i+0)%16`*4]
	eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
	ldr $t4,[sp,#`($i+9)%16`*4]

	add $t2,$t2,$t0
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
	add $t1,$t1,$t2
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
	add $t1,$t1,$t4 @ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
# define adrl adr
.thumb
# else
.code 32
# endif
#endif

.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
.align 5

.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub r3,pc,#8 @ sha256_block_data_order
#else
	adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr r12,.LOPENSSL_armcap
	ldr r12,[r3,r12] @ OPENSSL_armcap_P
	tst r12,#ARMV8_SHA256
	bne .LARMv8
	tst r12,#ARMV7_NEON
	bne .LNEON
#endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub $Ktbl,r3,#256+32 @ K256
	sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t3,$B,$C @ magic
	eor $t2,$t2,$t2
___
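# The first 16 rounds take X[i] straight from the input block; the remaining
# 48 rounds run .Lrounds_16_xx, which is unrolled 16 rounds deep and executed
# three times (the "done?" test on the last K256 byte, 0xf2, ends the loop).
# The working variables a..h live in @V and are "rotated" after every round
# so a single round body serves all eight register positions.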
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite eq @ Thumb2 thing, sanity check in ARM
#endif
	ldreq $t3,[sp,#16*4] @ pull ctx
	bne .Lrounds_16_xx

	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t0,[$t3,#0]
	ldr $t1,[$t3,#4]
	ldr $t2,[$t3,#8]
	add $A,$A,$t0
	ldr $t0,[$t3,#12]
	add $B,$B,$t1
	ldr $t1,[$t3,#16]
	add $C,$C,$t2
	ldr $t2,[$t3,#20]
	add $D,$D,$t0
	ldr $t0,[$t3,#24]
	add $E,$E,$t1
	ldr $t1,[$t3,#28]
	add $F,$F,$t2
	ldr $inp,[sp,#17*4] @ pull inp
	ldr $t2,[sp,#18*4] @ pull inp+len
	add $G,$G,$t0
	add $H,$H,$t1
	stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp $inp,$t2
	sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
	bne .Loop

	add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
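
# Dlo/Dhi map a NEON quad register to its low/high double-word halves,
# e.g. Dlo("q1") is "d2" and Dhi("q1") is "d3".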

sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
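
# Undefined sub calls such as &vshr_u32(...) or &veor(...) are caught by
# AUTOLOAD above and emitted as the matching NEON mnemonic ("vshr.u32",
# "veor", ...), with "#" prepended to a purely numeric final argument.
# Xupdate below computes four message-schedule words with NEON while
# eval()ing the scalar round snippets supplied by $body in between, so the
# integer and NEON pipelines overlap.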

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32 ($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor ($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32 ($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor ($T1,$T1,$T3); # sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor ($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor ($T5,$T5,$T4); # sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5); # X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor ($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor ($T5,$T5,$T4); # sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5); # X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32 ($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32 ("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X)); # "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32 ("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8 (@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32 ($T0,$T0,@X[0]);
	 foreach (@insns) { eval; } # remaining instructions
	&vst1_32 ("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X)); # "rotate" X[]
}
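
# body_00_15 returns a list of Perl snippets, one small scalar-ALU step of
# the SHA-256 round per element; Xupdate/Xpreload eval() them one at a time
# in between the NEON instructions above.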

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add ($h,$h,$t1)', # h+=X[i]+K[i]
	'&eor ($t1,$f,$g)',
	'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
	'&and ($t1,$t1,$e)',
	'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
	'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor ($t1,$t1,$g)', # Ch(e,f,g)
	'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
	'&eor ($t2,$a,$b)', # a^b, b^c in next round
	'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
	'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
	'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
	'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
	'&ldr ($t1,"[sp,#64]") if ($j==31)',
	'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
	'&add ($d,$d,$h)', # d+=h
	'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
	'&eor ($t3,$t3,$b)', # Maj(a,b,c)
	'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
	stmdb sp!,{r4-r12,lr}

	sub $H,sp,#16*4+16
	adrl $Ktbl,K256
	bic $H,$H,#15 @ align for 128-bit stores
	mov $t2,sp
	mov sp,$H @ alloca
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp

	vld1.8 {@X[0]},[$inp]!
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	vld1.32 {$T0},[$Ktbl,:128]!
	vld1.32 {$T1},[$Ktbl,:128]!
	vld1.32 {$T2},[$Ktbl,:128]!
	vld1.32 {$T3},[$Ktbl,:128]!
	vrev32.8 @X[0],@X[0] @ yes, even on
	str $ctx,[sp,#64]
	vrev32.8 @X[1],@X[1] @ big-endian
	str $inp,[sp,#68]
	mov $Xfer,sp
	vrev32.8 @X[2],@X[2]
	str $len,[sp,#72]
	vrev32.8 @X[3],@X[3]
	str $t2,[sp,#76] @ save original sp
	vadd.i32 $T0,$T0,@X[0]
	vadd.i32 $T1,$T1,@X[1]
	vst1.32 {$T0},[$Xfer,:128]!
	vadd.i32 $T2,$T2,@X[2]
	vst1.32 {$T1},[$Xfer,:128]!
	vadd.i32 $T3,$T3,@X[3]
	vst1.32 {$T2},[$Xfer,:128]!
	vst1.32 {$T3},[$Xfer,:128]!

	ldmia $ctx,{$A-$H}
	sub $Xfer,$Xfer,#64
	ldr $t1,[sp,#0]
	eor $t2,$t2,$t2
	eor $t3,$B,$C
	b .L_00_48

.align 4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq $t1,#0 @ check for K256 terminator
	ldr $t1,[sp,#0]
	sub $Xfer,$Xfer,#64
	bne .L_00_48

	ldr $inp,[sp,#68]
	ldr $t0,[sp,#72]
	sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
	teq $inp,$t0
	it eq
	subeq $inp,$inp,#64 @ avoid SEGV
	vld1.8 {@X[0]},[$inp]! @ load next input block
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	it ne
	strne $inp,[sp,#68]
	mov $Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr $t0,[$t1,#0]
	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t2,[$t1,#4]
	ldr $t3,[$t1,#8]
	ldr $t4,[$t1,#12]
	add $A,$A,$t0 @ accumulate
	ldr $t0,[$t1,#16]
	add $B,$B,$t2
	ldr $t2,[$t1,#20]
	add $C,$C,$t3
	ldr $t3,[$t1,#24]
	add $D,$D,$t4
	ldr $t4,[$t1,#28]
	add $E,$E,$t0
	str $A,[$t1],#4
	add $F,$F,$t2
	str $B,[$t1],#4
	add $G,$G,$t3
	str $C,[$t1],#4
	add $H,$H,$t4
	str $D,[$t1],#4
	stmia $t1,{$E-$H}

	ittte ne
	movne $Xfer,sp
	ldrne $t1,[sp,#0]
	eorne $t2,$t2,$t2
	ldreq sp,[sp,#76] @ restore original sp
	itt ne
	eorne $t3,$B,$C
	bne .L_00_48

	ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}

######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
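
# This path uses the ARMv8 Cryptographic Extension instructions:
# sha256su0/sha256su1 update the message schedule, while each
# sha256h/sha256h2 pair advances the ABCD/EFGH halves of the state by four
# rounds, so every W0/W1 quadword below accounts for four SHA-256 rounds.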

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif

.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr $Ktbl,.LARMv8
	sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl $Ktbl,K256
# endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp

.Loop_v8:
	vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
	vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
	vld1.32 {$W0},[$Ktbl]!
	vrev32.8 @MSG[0],@MSG[0]
	vrev32.8 @MSG[1],@MSG[1]
	vrev32.8 @MSG[2],@MSG[2]
	vrev32.8 @MSG[3],@MSG[3]
	vmov $ABCD_SAVE,$ABCD @ offload
	vmov $EFGH_SAVE,$EFGH
	teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	sha256su0 @MSG[0],@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vld1.32 {$W0},[$Ktbl]!
	vadd.i32 $W1,$W1,@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vld1.32 {$W1},[$Ktbl]
	vadd.i32 $W0,$W0,@MSG[2]
	sub $Ktbl,$Ktbl,#256-16 @ rewind
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vadd.i32 $W1,$W1,@MSG[3]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
	vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
	it ne
	bne .Loop_v8

	vst1.32 {$ABCD,$EFGH},[$ctx]

	ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}

$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
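
# Re-read this very script and copy its leading comment block into the
# generated assembly, converting '#' comments to the assembler's '@' style.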
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40 );
  sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
  }
}
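
# Final pass over the generated text: evaluate the `...` arithmetic left in
# the templates, replace sha256* mnemonics with INST() byte sequences, and
# rewrite ret/bx lr so the output still assembles with -march=armv4.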
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush