ghashp8-ppc.pl 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # GHASH for PowerISA v2.07.
  11. #
  12. # July 2014
  13. #
  14. # Accurate performance measurements are problematic, because it's
  15. # always a virtualized setup with a possibly throttled processor.
  16. # Relative comparison is therefore more informative. This initial
  17. # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
  18. # faster than "4-bit" integer-only compiler-generated 64-bit code.
  19. # "Initial version" means that there is room for further improvement.
  # Command line: <flavour> [<output>].  Both values are later handed to
  # the ppc-xlate.pl translator unchanged.
  ($flavour, $output) = (shift(@ARGV), shift(@ARGV));

  # Pick the word size, the link-register save offset and the
  # store-with-update / load / store mnemonics for the requested flavour.
  # A flavour mentioning neither 64 nor 32 is rejected.
  if ($flavour =~ /64/) {
      ($SIZE_T, $STU, $POP, $PUSH) = (8, "stdu", "ld", "std");
      $LRSAVE = 2*$SIZE_T;
  }
  elsif ($flavour =~ /32/) {
      ($SIZE_T, $STU, $POP, $PUSH) = (4, "stwu", "lwz", "stw");
      $LRSAVE = $SIZE_T;
  }
  else {
      die "nonsense $flavour";
  }
  # Locate the ppc-xlate.pl translator: first in the directory this script
  # lives in, then in ../../perlasm/ relative to it.  When $0 carries no
  # directory component the prefix is empty (instead of whatever $1 was
  # left over from an earlier match).
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  die "can't locate ppc-xlate.pl";

  # Everything printed to STDOUT from here on is piped through the
  # translator.  Low-precedence `or` is required: with `||` the die was
  # attached to the (always true) command string, so a failed open went
  # unnoticed.
  open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
  # Symbolic register names interpolated into the assembly template below.
  #   r3..r6  - argument block ($Xip, $Htbl, $inp, $len)
  #   v0..v3  - $Xl, $Xm, $Xh (product parts) and $IN (current input block)
  #   v4..v12 - zero, temporaries, the 0xc2 constant, the three key
  #             vectors ($H, $Hh, $Hl) and the little-endian permute mask
  #   r12     - holds the caller's SPR 256 value while a routine runs
  my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
  my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
  my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
  my $vrsave="r12";

  # Assembly template with three entry points:
  #   gcm_init_p8  - loads H from r4, derives the "twisted" key and stores
  #                  four 16-byte vectors at r3+0x00 (0xc2 constant),
  #                  +0x10 (Hl), +0x20 (H) and +0x30 (Hh).
  #   gcm_gmult_p8 - loads Xi from $Xip and the table from $Htbl,
  #                  multiplies, reduces in two phases, writes Xi back.
  #   gcm_ghash_p8 - same multiplication, folding in input read from $inp
  #                  16 bytes per iteration with $len as the byte count.
  # Lines prefixed "le?" are kept only for little-endian flavours by the
  # post-processing loop at the end of this script.
  #
  # Each entry point is introduced by .globl, an alignment directive and
  # its own label: .globl merely declares the symbol and .size refers to
  # it, so without the label the routine would never be defined.
  $code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
lis r0,0xfff0
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
le?xor r7,r7,r7
le?addi r7,r7,0x8 # need a vperm start with 08
le?lvsr 5,0,r7
le?vspltisb 6,0x0f
le?vxor 5,5,6 # set a b-endian mask
le?vperm $H,$H,$H,5
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $H,$H,$t1 # twisted H
vsldoi $H,$H,$H,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
stvx_u $H, r9,r3
stvx_u $Hh,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
lvx_u $IN,0,$inp
addi $inp,$inp,16
subi $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
b Loop
.align 5
Loop:
subic $len,$len,16
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
subfe. r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
add $inp,$inp,r0
vpmsumd $t2,$Xl,$xC2 # 1st phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,0,$inp
addi $inp,$inp,16
vsldoi $t1,$Xl,$Xl,8 # 2nd phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
beq Loop # did $len-=16 borrow?
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
  # Resolve the endian-specific line prefixes and feed the result to the
  # translator pipe on STDOUT.  For flavours ending in "le" the "le?"
  # prefix is dropped and "be?" becomes "#be#"; otherwise "le?" becomes
  # "#le#" and "be?" is dropped.  At most one rewrite is applied per line.
  my $little_endian = ($flavour =~ /le$/);   # loop-invariant, test once

  foreach (split("\n",$code)) {
      if ($little_endian) {
          s/le\?// or s/be\?/#be#/;
      }
      else {
          s/le\?/#le#/ or s/be\?//;
      }
      print $_,"\n";
  }

  # Closing the pipe flushes the output and waits for the translator.  A
  # false return means a write error ($! set) or a non-zero translator
  # exit ($? set); either way the generated file cannot be trusted.
  close STDOUT
      or die "error closing STDOUT: " . ($! ? $! : "translator exit status $?");
  199. close STDOUT; # enforce flush