#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in the sense that it supports both
# big- and little-endian cases. Data alignment in parallelizable modes
# is handled with VSX loads and stores, which implies MSR.VSX flag
# being set. It should also be noted that the ISA specification doesn't
# prohibit alignment exceptions for these instructions on page
# boundaries. Initially alignment was handled in pure AltiVec/VMX way
# [when data is aligned programmatically, which in turn guarantees
# exception-free execution], but it turned out to hamper performance
# when vcipher instructions are interleaved. It's reckoned that the
# eventual misalignment penalties at page boundaries are on average
# lower than the additional overhead of the pure AltiVec approach.
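#
# The generated file exports the routines below. The C prototypes are
# a sketch of how callers commonly declare them (an assumption based on
# OpenSSL's AES_KEY structure, whose round count the code reads at
# offset 240), not an authoritative interface definition:
#
#	int  aes_p8_set_encrypt_key(const unsigned char *inp, int bits,
#	                            AES_KEY *key);
#	int  aes_p8_set_decrypt_key(const unsigned char *inp, int bits,
#	                            AES_KEY *key);
#	void aes_p8_encrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#	void aes_p8_decrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#	void aes_p8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                        size_t len, const AES_KEY *key,
#	                        unsigned char *ivec, int enc);
#	void aes_p8_ctr32_encrypt_blocks(const unsigned char *in,
#	                                 unsigned char *out, size_t blocks,
#	                                 const AES_KEY *key,
#	                                 const unsigned char *ivec);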
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
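
# The output is piped through ppc-xlate.pl, which resolves the ?le/?be
# conditional constructs used below. A typical invocation (the flavour
# name here is an assumption following the usual perlasm conventions)
# would be:
#
#	perl aesp8-ppc.pl linux64le aesp8-ppc.s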
$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";

#########################################################################
{{{	# Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr
	addi	$ptr,$ptr,-0x48		# distance between . and rcon
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr	r11
	$PUSH	r11,$LRSAVE($sp)

	li	$ptr,-1
	${UCMP}i	$inp,0
	beq-	Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-	Lenc_key_abort		# if ($out==0) return -1;
	li	$ptr,-2
	cmpwi	$bits,128
	blt-	Lenc_key_abort
	cmpwi	$bits,256
	bgt-	Lenc_key_abort
	andi.	r0,$bits,0x3f
	bne-	Lenc_key_abort

	lis	r0,0xfff0
	mfspr	$vrsave,256
	mtspr	256,r0

	bl	Lconsts
	mtlr	r11

	neg	r9,$inp
	lvx	$in0,0,$inp
	addi	$inp,$inp,15		# 15 is not typo
	lvsr	$key,0,r9		# borrow $key
	li	r8,0x20
	cmpwi	$bits,192
	lvx	$in1,0,$inp
	le?vspltisb	$mask,0x0f	# borrow $mask
	lvx	$rcon,0,$ptr
	le?vxor	$key,$key,$mask		# adjust for byte swap
	lvx	$mask,r8,$ptr
	addi	$ptr,$ptr,0x10
	vperm	$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li	$cnt,8
	vxor	$zero,$zero,$zero
	mtctr	$cnt

	?lvsr	$outperm,0,$out
	vspltisb	$outmask,-1
	lvx	$outhead,0,$out
	?vperm	$outmask,$zero,$outmask,$outperm

	blt	Loop128
	addi	$inp,$inp,8
	beq	L192
	addi	$inp,$inp,8
	b	L256
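
# Each Loop128 iteration derives one round key: the vperm "rotate-n-splat"
# arranges the last word so that vcipherlast effectively applies
# SubWord(RotWord(...)) and xors in the round constant held in $rcon,
# while the vsldoi/vxor chain propagates the w[i] ^= w[i-1] recurrence
# across the remaining three words.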
.align	4
Loop128:
	vperm	$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi	$tmp,$zero,$in0,12	# >>32
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx	$stage,0,$out
	addi	$out,$out,16

	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vadduwm	$rcon,$rcon,$rcon
	vxor	$in0,$in0,$key
	bdnz	Loop128

	lvx	$rcon,0,$ptr		# last two round keys

	vperm	$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi	$tmp,$zero,$in0,12	# >>32
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx	$stage,0,$out
	addi	$out,$out,16

	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vadduwm	$rcon,$rcon,$rcon
	vxor	$in0,$in0,$key

	vperm	$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi	$tmp,$zero,$in0,12	# >>32
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx	$stage,0,$out
	addi	$out,$out,16

	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vxor	$in0,$in0,$key
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	stvx	$stage,0,$out

	addi	$inp,$out,15		# 15 is not typo
	addi	$out,$out,0x50

	li	$rounds,10
	b	Ldone

.align	4
L192:
	lvx	$tmp,0,$inp
	li	$cnt,4
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	stvx	$stage,0,$out
	addi	$out,$out,16
	vperm	$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8		# borrow $key
	mtctr	$cnt
	vsububm	$mask,$mask,$key	# adjust the mask
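
# The 192-bit schedule expands six words at a time, so each Loop192
# iteration assembles one and a half round keys; the mask adjustment
# above shifts the rotate-n-splat selection to the last word of the
# 6-word key layout.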
Loop192:
	vperm	$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi	$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp

	vsldoi	$stage,$zero,$in1,8
	vspltw	$tmp,$in0,3
	vxor	$tmp,$tmp,$in1
	vsldoi	$in1,$zero,$in1,12	# >>32
	vadduwm	$rcon,$rcon,$rcon
	vxor	$in1,$in1,$tmp
	vxor	$in0,$in0,$key
	vxor	$in1,$in1,$key
	vsldoi	$stage,$stage,$in0,8

	vperm	$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi	$tmp,$zero,$in0,12	# >>32
	vperm	$outtail,$stage,$stage,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx	$stage,0,$out
	addi	$out,$out,16

	vsldoi	$stage,$in0,$in1,8
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vperm	$outtail,$stage,$stage,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	stvx	$stage,0,$out
	addi	$out,$out,16

	vspltw	$tmp,$in0,3
	vxor	$tmp,$tmp,$in1
	vsldoi	$in1,$zero,$in1,12	# >>32
	vadduwm	$rcon,$rcon,$rcon
	vxor	$in1,$in1,$tmp
	vxor	$in0,$in0,$key
	vxor	$in1,$in1,$key
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	stvx	$stage,0,$out
	addi	$inp,$out,15		# 15 is not typo
	addi	$out,$out,16
	bdnz	Loop192

	li	$rounds,12
	addi	$out,$out,0x20
	b	Ldone

.align	4
L256:
	lvx	$tmp,0,$inp
	li	$cnt,7
	li	$rounds,14
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	stvx	$stage,0,$out
	addi	$out,$out,16
	vperm	$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr	$cnt
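
# The 256-bit schedule alternates two derivations: even steps use the
# rotate-n-splat/vcipherlast combination as in Loop128, while odd steps
# (after the bdz) apply vsbox alone, i.e. SubWord without RotWord, as
# AES-256 key expansion requires.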
Loop256:
	vperm	$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi	$tmp,$zero,$in0,12	# >>32
	vperm	$outtail,$in1,$in1,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	stvx	$stage,0,$out
	addi	$out,$out,16

	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in0,$in0,$tmp
	vadduwm	$rcon,$rcon,$rcon
	vxor	$in0,$in0,$key
	vperm	$outtail,$in0,$in0,$outperm	# rotate
	vsel	$stage,$outhead,$outtail,$outmask
	vmr	$outhead,$outtail
	stvx	$stage,0,$out
	addi	$inp,$out,15		# 15 is not typo
	addi	$out,$out,16
	bdz	Ldone

	vspltw	$key,$in0,3		# just splat
	vsldoi	$tmp,$zero,$in1,12	# >>32
	vsbox	$key,$key

	vxor	$in1,$in1,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in1,$in1,$tmp
	vsldoi	$tmp,$zero,$tmp,12	# >>32
	vxor	$in1,$in1,$tmp

	vxor	$in1,$in1,$key
	b	Loop256

.align	4
Ldone:
	lvx	$in1,0,$inp		# redundant in aligned case
	vsel	$in1,$outhead,$in1,$outmask
	stvx	$in1,0,$inp
	li	$ptr,0
	mtspr	256,$vrsave
	stw	$rounds,0($out)

Lenc_key_abort:
	mr	r3,$ptr
	blr
	.long	0
	.byte	0,12,0x14,1,0,0,3,0
	.long	0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	mflr	r10
	$PUSH	r10,$FRAME+$LRSAVE($sp)
	bl	Lset_encrypt_key
	mtlr	r10

	cmpwi	r3,0
	bne-	Ldec_key_abort

	slwi	$cnt,$rounds,4
	subi	$inp,$out,240		# first round key
	srwi	$rounds,$rounds,1
	add	$out,$inp,$cnt		# last round key
	mtctr	$rounds
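
# The decryption schedule is the encryption schedule in reverse order:
# the loop below swaps round keys end-for-end, four words at a time,
# working from both ends toward the middle (hence $rounds was halved).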
Ldeckey:
	lwz	r0, 0($inp)
	lwz	r6, 4($inp)
	lwz	r7, 8($inp)
	lwz	r8, 12($inp)
	addi	$inp,$inp,16
	lwz	r9, 0($out)
	lwz	r10,4($out)
	lwz	r11,8($out)
	lwz	r12,12($out)
	stw	r0, 0($out)
	stw	r6, 4($out)
	stw	r7, 8($out)
	stw	r8, 12($out)
	subi	$out,$out,16
	stw	r9, -16($inp)
	stw	r10,-12($inp)
	stw	r11,-8($inp)
	stw	r12,-4($inp)
	bdnz	Ldeckey

	xor	r3,r3,r3		# return value
Ldec_key_abort:
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,3,0
	.long	0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz	$rounds,240($key)
	lis	r0,0xfc00
	mfspr	$vrsave,256
	li	$idx,15			# 15 is not typo
	mtspr	256,r0

	lvx	v0,0,$inp
	neg	r11,$out
	lvx	v1,$idx,$inp
	lvsl	v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl	v3,0,r11		# outperm
	le?vxor	v2,v2,v4
	li	$idx,16
	vperm	v0,v0,v1,v2		# align [and byte swap in LE]
	lvx	v1,0,$key
	?lvsl	v5,0,$key		# keyperm
	srwi	$rounds,$rounds,1
	lvx	v2,$idx,$key
	addi	$idx,$idx,16
	subi	$rounds,$rounds,1
	?vperm	v1,v1,v2,v5		# align round key

	vxor	v0,v0,v1
	lvx	v1,$idx,$key
	addi	$idx,$idx,16
	mtctr	$rounds
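
# The loop body applies two cipher rounds per iteration, which is why
# $rounds was halved (and decremented) above; round keys are
# vperm-aligned on the fly so an unaligned key schedule is tolerated.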
Loop_${dir}c:
	?vperm	v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx	v2,$idx,$key
	addi	$idx,$idx,16
	?vperm	v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx	v1,$idx,$key
	addi	$idx,$idx,16
	bdnz	Loop_${dir}c

	?vperm	v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx	v2,$idx,$key
	?vperm	v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor	v1,v1,v1
	li	$idx,15			# 15 is not typo
	?vperm	v2,v1,v2,v3		# outmask
	le?vxor	v3,v3,v4
	lvx	v1,0,$out		# outhead
	vperm	v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel	v1,v1,v0,v2
	lvx	v4,$idx,$out
	stvx	v1,0,$out
	vsel	v0,v0,v4,v2
	stvx	v0,$idx,$out

	mtspr	256,$vrsave
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));

$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi	$enc,0			# test direction
	lis	r0,0xffe0
	mfspr	$vrsave,256
	mtspr	256,r0

	li	$idx,15
	vxor	$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx	$ivec,0,$ivp		# load [unaligned] iv
	lvsl	$inpperm,0,$ivp
	lvx	$inptail,$idx,$ivp
	le?vxor	$inpperm,$inpperm,$tmp
	vperm	$ivec,$ivec,$inptail,$inpperm

	neg	r11,$inp
	?lvsl	$keyperm,0,$key		# prepare for unaligned key
	lwz	$rounds,240($key)

	lvsr	$inpperm,0,r11		# prepare for unaligned load
	lvx	$inptail,0,$inp
	addi	$inp,$inp,15		# 15 is not typo
	le?vxor	$inpperm,$inpperm,$tmp

	?lvsr	$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx	$outhead,0,$out
	?vperm	$outmask,$rndkey0,$outmask,$outperm
	le?vxor	$outperm,$outperm,$tmp

	srwi	$rounds,$rounds,1
	li	$idx,16
	subi	$rounds,$rounds,1
	beq	Lcbc_dec
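
# CBC encryption is inherently serial, since every block is chained
# into the next through $ivec, so it proceeds one block at a time.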
Lcbc_enc:
	vmr	$inout,$inptail
	lvx	$inptail,0,$inp
	addi	$inp,$inp,16
	mtctr	$rounds
	subi	$len,$len,16		# len-=16

	lvx	$rndkey0,0,$key
	vperm	$inout,$inout,$inptail,$inpperm
	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor	$inout,$inout,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16
	vxor	$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm	$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher	$inout,$inout,$rndkey1
	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher	$inout,$inout,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16
	bdnz	Loop_cbc_enc

	?vperm	$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher	$inout,$inout,$rndkey1
	lvx	$rndkey1,$idx,$key
	li	$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm	$tmp,$ivec,$ivec,$outperm
	vsel	$inout,$outhead,$tmp,$outmask
	vmr	$outhead,$tmp
	stvx	$inout,0,$out
	addi	$out,$out,16
	bge	Lcbc_enc

	b	Lcbc_done
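
# CBC decryption has no such dependency chain, so inputs of 128 bytes
# or more are diverted to the 8x interleaved routine further below.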
.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge	_aesp8_cbc_decrypt8x
	vmr	$tmp,$inptail
	lvx	$inptail,0,$inp
	addi	$inp,$inp,16
	mtctr	$rounds
	subi	$len,$len,16		# len-=16

	lvx	$rndkey0,0,$key
	vperm	$tmp,$tmp,$inptail,$inpperm
	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor	$inout,$tmp,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16

Loop_cbc_dec:
	?vperm	$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16
	bdnz	Loop_cbc_dec

	?vperm	$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx	$rndkey1,$idx,$key
	li	$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor	$inout,$inout,$ivec
	vmr	$ivec,$tmp
	vperm	$tmp,$inout,$inout,$outperm
	vsel	$inout,$outhead,$tmp,$outmask
	vmr	$outhead,$tmp
	stvx	$inout,0,$out
	addi	$out,$out,16
	bge	Lcbc_dec

Lcbc_done:
	addi	$out,$out,-1
	lvx	$inout,0,$out		# redundant in aligned case
	vsel	$inout,$outhead,$inout,$outmask
	stvx	$inout,0,$out

	neg	$enc,$ivp		# write [unaligned] iv
	li	$idx,15			# 15 is not typo
	vxor	$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl	$outperm,0,$enc
	?vperm	$outmask,$rndkey0,$outmask,$outperm
	le?vxor	$outperm,$outperm,$tmp
	lvx	$outhead,0,$ivp
	vperm	$ivec,$ivec,$ivec,$outperm
	vsel	$inout,$outhead,$ivec,$outmask
	lvx	$inptail,$idx,$ivp
	stvx	$inout,0,$ivp
	vsel	$inout,$ivec,$inptail,$outmask
	stvx	$inout,$idx,$ivp

	mtspr	256,$vrsave
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,6,0
	.long	0
___
#########################################################################
{{	# Optimized CBC decrypt procedure				#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
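
# Stack layout, an informal sketch of what the prologue below sets up:
# the first 8*16 bytes above $FRAME hold aligned copies of early round
# keys off-loaded from the schedule, v20-v31 are saved above them (the
# ABI requires it), vrsave sits at $FRAME+21*16-4, and the six saved
# GPRs r26-r31 follow at $FRAME+21*16.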
$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU	$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li	r10,`$FRAME+8*16+15`
	li	r11,`$FRAME+8*16+31`
	stvx	v20,r10,$sp		# ABI says so
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	li	r0,-1
	stw	$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li	$x10,0x10
	$PUSH	r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li	$x20,0x20
	$PUSH	r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li	$x30,0x30
	$PUSH	r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li	$x40,0x40
	$PUSH	r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li	$x50,0x50
	$PUSH	r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li	$x60,0x60
	$PUSH	r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li	$x70,0x70
	mtspr	256,r0

	subi	$rounds,$rounds,3	# -4 in total
	subi	$len,$len,128		# bias

	lvx	$rndkey0,$x00,$key	# load key schedule
	lvx	v30,$x10,$key
	addi	$key,$key,0x20
	lvx	v31,$x00,$key
	?vperm	$rndkey0,$rndkey0,v30,$keyperm
	addi	$key_,$sp,$FRAME+15
	mtctr	$rounds
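
# Pre-permute the whole key schedule once and stash aligned copies on
# the stack, so the 8x loop below can reload round keys with plain lvx
# instead of re-aligning them every pass.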
Load_cbc_dec_key:
	?vperm	v24,v30,v31,$keyperm
	lvx	v30,$x10,$key
	addi	$key,$key,0x20
	stvx	v24,$x00,$key_		# off-load round[1]
	?vperm	v25,v31,v30,$keyperm
	lvx	v31,$x00,$key
	stvx	v25,$x10,$key_		# off-load round[2]
	addi	$key_,$key_,0x20
	bdnz	Load_cbc_dec_key

	lvx	v26,$x10,$key
	?vperm	v24,v30,v31,$keyperm
	lvx	v27,$x20,$key
	stvx	v24,$x00,$key_		# off-load round[3]
	?vperm	v25,v31,v26,$keyperm
	lvx	v28,$x30,$key
	stvx	v25,$x10,$key_		# off-load round[4]
	addi	$key_,$sp,$FRAME+15	# rewind $key_
	?vperm	v26,v26,v27,$keyperm
	lvx	v29,$x40,$key
	?vperm	v27,v27,v28,$keyperm
	lvx	v30,$x50,$key
	?vperm	v28,v28,v29,$keyperm
	lvx	v31,$x60,$key
	?vperm	v29,v29,v30,$keyperm
	lvx	$out0,$x70,$key		# borrow $out0
	?vperm	v30,v30,v31,$keyperm
	lvx	v24,$x00,$key_		# pre-load round[1]
	?vperm	v31,v31,$out0,$keyperm
	lvx	v25,$x10,$key_		# pre-load round[2]

	#lvx	$inptail,0,$inp		# "caller" already did this
	#addi	$inp,$inp,15		# 15 is not typo
	subi	$inp,$inp,15		# undo "caller"
	le?li	$idx,8
	lvx_u	$in0,$x00,$inp		# load first 8 "words"
	le?lvsl	$inpperm,0,$idx
	le?vspltisb	$tmp,0x0f
	lvx_u	$in1,$x10,$inp
	le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u	$in2,$x20,$inp
	le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u	$in3,$x30,$inp
	le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u	$in4,$x40,$inp
	le?vperm	$in2,$in2,$in2,$inpperm
	vxor	$out0,$in0,$rndkey0
	lvx_u	$in5,$x50,$inp
	le?vperm	$in3,$in3,$in3,$inpperm
	vxor	$out1,$in1,$rndkey0
	lvx_u	$in6,$x60,$inp
	le?vperm	$in4,$in4,$in4,$inpperm
	vxor	$out2,$in2,$rndkey0
	lvx_u	$in7,$x70,$inp
	addi	$inp,$inp,0x80
	le?vperm	$in5,$in5,$in5,$inpperm
	vxor	$out3,$in3,$rndkey0
	le?vperm	$in6,$in6,$in6,$inpperm
	vxor	$out4,$in4,$rndkey0
	le?vperm	$in7,$in7,$in7,$inpperm
	vxor	$out5,$in5,$rndkey0
	vxor	$out6,$in6,$rndkey0
	vxor	$out7,$in7,$rndkey0
	mtctr	$rounds
	b	Loop_cbc_dec8x
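
# Eight independent data streams hide the vncipher latency; v24/v25
# rotate through the stacked round-key copies, two rounds per pass.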
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx	v24,$x20,$key_		# round[3]
	addi	$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx	v25,$x10,$key_		# round[4]
	bdnz	Loop_cbc_dec8x

	subic	$len,$len,128		# $len-=128
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	subfe.	r0,r0,r0		# borrow?-1:0
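	# subfe. turns the carry from subic into an all-zeros/all-ones
	# mask: 0 if another full 128-byte batch remains, -1 otherwise;
	# ANDed with the (negative) residual $len it later backs $inp up.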
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	and	r0,r0,$len
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	add	$inp,$inp,r0		# $inp is adjusted in such
					# way that at exit from the
					# loop inX-in7 are loaded
					# with last "words"
	vncipher	$out0,$out0,v27
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	addi	$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx	v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx	v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	 vxor	$ivec,$ivec,v31		# xor with last round key
	vncipher	$out1,$out1,v30
	 vxor	$in0,$in0,v31
	vncipher	$out2,$out2,v30
	 vxor	$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor	$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor	$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor	$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor	$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor	$in6,$in6,v31

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	 lvx_u	$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	 lvx_u	$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	 le?vperm	$in0,$in0,$in0,$inpperm
	 lvx_u	$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	 le?vperm	$in1,$in1,$in1,$inpperm
	 lvx_u	$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	 le?vperm	$in2,$in2,$in2,$inpperm
	 lvx_u	$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	 le?vperm	$in3,$in3,$in3,$inpperm
	 lvx_u	$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	 le?vperm	$in4,$in4,$in4,$inpperm
	 lvx_u	$in6,$x60,$inp
	vmr	$ivec,$in7
	 le?vperm	$in5,$in5,$in5,$inpperm
	 lvx_u	$in7,$x70,$inp
	 addi	$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u	$out0,$x00,$out
	 le?vperm	$in6,$in6,$in6,$inpperm
	 vxor	$out0,$in0,$rndkey0
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u	$out1,$x10,$out
	 le?vperm	$in7,$in7,$in7,$inpperm
	 vxor	$out1,$in1,$rndkey0
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u	$out2,$x20,$out
	 vxor	$out2,$in2,$rndkey0
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u	$out3,$x30,$out
	 vxor	$out3,$in3,$rndkey0
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u	$out4,$x40,$out
	 vxor	$out4,$in4,$rndkey0
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u	$out5,$x50,$out
	 vxor	$out5,$in5,$rndkey0
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x60,$out
	 vxor	$out6,$in6,$rndkey0
	stvx_u	$out7,$x70,$out
	addi	$out,$out,0x80
	 vxor	$out7,$in7,$rndkey0

	mtctr	$rounds
	beq	Loop_cbc_dec8x		# did $len-=128 borrow?

	addic.	$len,$len,128
	beq	Lcbc_dec8x_done
	nop
	nop
Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx	v24,$x20,$key_		# round[3]
	addi	$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx	v25,$x10,$key_		# round[4]
	bdnz	Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28

	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29

	vncipher	$out1,$out1,v30
	 vxor	$ivec,$ivec,v31		# last round key
	vncipher	$out2,$out2,v30
	 vxor	$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor	$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor	$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor	$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor	$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor	$in6,$in6,v31

	cmplwi	$len,32			# switch($len)
	blt	Lcbc_dec8x_one
	nop
	beq	Lcbc_dec8x_two
	cmplwi	$len,64
	blt	Lcbc_dec8x_three
	nop
	beq	Lcbc_dec8x_four
	cmplwi	$len,96
	blt	Lcbc_dec8x_five
	nop
	beq	Lcbc_dec8x_six
Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr	$ivec,$in7

	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u	$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u	$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u	$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u	$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u	$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x50,$out
	stvx_u	$out7,$x60,$out
	addi	$out,$out,0x70
	b	Lcbc_dec8x_done

.align	5
Lcbc_dec8x_six:
	vncipherlast	$out2,$out2,$ivec
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr	$ivec,$in7

	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u	$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u	$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u	$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u	$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x40,$out
	stvx_u	$out7,$x50,$out
	addi	$out,$out,0x60
	b	Lcbc_dec8x_done

.align	5
Lcbc_dec8x_five:
	vncipherlast	$out3,$out3,$ivec
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr	$ivec,$in7

	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u	$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u	$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u	$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x30,$out
	stvx_u	$out7,$x40,$out
	addi	$out,$out,0x50
	b	Lcbc_dec8x_done

.align	5
Lcbc_dec8x_four:
	vncipherlast	$out4,$out4,$ivec
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr	$ivec,$in7

	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u	$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u	$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x20,$out
	stvx_u	$out7,$x30,$out
	addi	$out,$out,0x40
	b	Lcbc_dec8x_done

.align	5
Lcbc_dec8x_three:
	vncipherlast	$out5,$out5,$ivec
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr	$ivec,$in7

	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u	$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x10,$out
	stvx_u	$out7,$x20,$out
	addi	$out,$out,0x30
	b	Lcbc_dec8x_done

.align	5
Lcbc_dec8x_two:
	vncipherlast	$out6,$out6,$ivec
	vncipherlast	$out7,$out7,$in6
	vmr	$ivec,$in7

	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out6,$x00,$out
	stvx_u	$out7,$x10,$out
	addi	$out,$out,0x20
	b	Lcbc_dec8x_done

.align	5
Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr	$ivec,$in7

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u	$out7,0,$out
	addi	$out,$out,0x10
Lcbc_dec8x_done:
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u	$ivec,0,$ivp		# write [unaligned] iv

	li	r10,`$FRAME+15`
	li	r11,`$FRAME+31`
	stvx	$inpperm,r10,$sp	# wipe copies of round keys
	addi	r10,r10,32
	stvx	$inpperm,r11,$sp
	addi	r11,r11,32
	stvx	$inpperm,r10,$sp
	addi	r10,r10,32
	stvx	$inpperm,r11,$sp
	addi	r11,r11,32
	stvx	$inpperm,r10,$sp
	addi	r10,r10,32
	stvx	$inpperm,r11,$sp
	addi	r11,r11,32
	stvx	$inpperm,r10,$sp
	addi	r10,r10,32
	stvx	$inpperm,r11,$sp
	addi	r11,r11,32

	mtspr	256,$vrsave
	lvx	v20,r10,$sp		# ABI says so
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	$POP	r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP	r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP	r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP	r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP	r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP	r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi	$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long	0
	.byte	0,12,0x14,0,0x80,6,6,0
	.long	0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}
#########################################################################
{{{	# CTR procedure[s]						#
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;

$code.=<<___;
.globl	.${prefix}_ctr32_encrypt_blocks
.align	5
.${prefix}_ctr32_encrypt_blocks:
	${UCMP}i	$len,1
	bltlr-

	lis	r0,0xfff0
	mfspr	$vrsave,256
	mtspr	256,r0

	li	$idx,15
	vxor	$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx	$ivec,0,$ivp		# load [unaligned] iv
	lvsl	$inpperm,0,$ivp
	lvx	$inptail,$idx,$ivp
	vspltisb	$one,1
	le?vxor	$inpperm,$inpperm,$tmp
	vperm	$ivec,$ivec,$inptail,$inpperm
	vsldoi	$one,$rndkey0,$one,1
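	# $one now holds 0...01, i.e. a single 1 in the last byte; the
	# vadduwm in the loop below therefore bumps only the low 32-bit
	# word of the counter block, matching CTR32 semantics.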
	neg	r11,$inp
	?lvsl	$keyperm,0,$key		# prepare for unaligned key
	lwz	$rounds,240($key)

	lvsr	$inpperm,0,r11		# prepare for unaligned load
	lvx	$inptail,0,$inp
	addi	$inp,$inp,15		# 15 is not typo
	le?vxor	$inpperm,$inpperm,$tmp

	srwi	$rounds,$rounds,1
	li	$idx,16
	subi	$rounds,$rounds,1

	${UCMP}i	$len,8
	bge	_aesp8_ctr32_encrypt8x

	?lvsr	$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx	$outhead,0,$out
	?vperm	$outmask,$rndkey0,$outmask,$outperm
	le?vxor	$outperm,$outperm,$tmp

	lvx	$rndkey0,0,$key
	mtctr	$rounds
	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor	$inout,$ivec,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16
	b	Loop_ctr32_enc
.align	5
Loop_ctr32_enc:
	?vperm	$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher	$inout,$inout,$rndkey1
	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher	$inout,$inout,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16
	bdnz	Loop_ctr32_enc

	vadduwm	$ivec,$ivec,$one
	vmr	$dat,$inptail
	lvx	$inptail,0,$inp
	addi	$inp,$inp,16
	subic.	$len,$len,1		# blocks--

	?vperm	$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher	$inout,$inout,$rndkey1
	lvx	$rndkey1,$idx,$key
	vperm	$dat,$dat,$inptail,$inpperm
	li	$idx,16
	?vperm	$rndkey1,$rndkey0,$rndkey1,$keyperm
	lvx	$rndkey0,0,$key
	vxor	$dat,$dat,$rndkey1	# last round key
	vcipherlast	$inout,$inout,$dat

	lvx	$rndkey1,$idx,$key
	addi	$idx,$idx,16
	vperm	$inout,$inout,$inout,$outperm
	vsel	$dat,$outhead,$inout,$outmask
	mtctr	$rounds
	?vperm	$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr	$outhead,$inout
	vxor	$inout,$ivec,$rndkey0
	lvx	$rndkey0,$idx,$key
	addi	$idx,$idx,16
	stvx	$dat,0,$out
	addi	$out,$out,16
	bne	Loop_ctr32_enc

	addi	$out,$out,-1
	lvx	$inout,0,$out		# redundant in aligned case
	vsel	$inout,$outhead,$inout,$outmask
	stvx	$inout,0,$out

	mtspr	256,$vrsave
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,6,0
	.long	0
___
#########################################################################
{{	# Optimized CTR procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);
$code.=<<___;
.align	5
_aesp8_ctr32_encrypt8x:
	$STU	$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li	r10,`$FRAME+8*16+15`
	li	r11,`$FRAME+8*16+31`
	stvx	v20,r10,$sp		# ABI says so
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	li	r0,-1
	stw	$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li	$x10,0x10
	$PUSH	r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li	$x20,0x20
	$PUSH	r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li	$x30,0x30
	$PUSH	r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li	$x40,0x40
	$PUSH	r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li	$x50,0x50
	$PUSH	r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li	$x60,0x60
	$PUSH	r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li	$x70,0x70
	mtspr	256,r0

	subi	$rounds,$rounds,3	# -4 in total

	lvx	$rndkey0,$x00,$key	# load key schedule
	lvx	v30,$x10,$key
	addi	$key,$key,0x20
	lvx	v31,$x00,$key
	?vperm	$rndkey0,$rndkey0,v30,$keyperm
	addi	$key_,$sp,$FRAME+15
	mtctr	$rounds

Load_ctr32_enc_key:
	?vperm	v24,v30,v31,$keyperm
	lvx	v30,$x10,$key
	addi	$key,$key,0x20
	stvx	v24,$x00,$key_		# off-load round[1]
	?vperm	v25,v31,v30,$keyperm
	lvx	v31,$x00,$key
	stvx	v25,$x10,$key_		# off-load round[2]
	addi	$key_,$key_,0x20
	bdnz	Load_ctr32_enc_key

	lvx	v26,$x10,$key
	?vperm	v24,v30,v31,$keyperm
	lvx	v27,$x20,$key
	stvx	v24,$x00,$key_		# off-load round[3]
	?vperm	v25,v31,v26,$keyperm
	lvx	v28,$x30,$key
	stvx	v25,$x10,$key_		# off-load round[4]
	addi	$key_,$sp,$FRAME+15	# rewind $key_
	?vperm	v26,v26,v27,$keyperm
	lvx	v29,$x40,$key
	?vperm	v27,v27,v28,$keyperm
	lvx	v30,$x50,$key
	?vperm	v28,v28,v29,$keyperm
	lvx	v31,$x60,$key
	?vperm	v29,v29,v30,$keyperm
	lvx	$out0,$x70,$key		# borrow $out0
	?vperm	v30,v30,v31,$keyperm
	lvx	v24,$x00,$key_		# pre-load round[1]
	?vperm	v31,v31,$out0,$keyperm
	lvx	v25,$x10,$key_		# pre-load round[2]

	vadduqm	$two,$one,$one
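	# Counter values for the batch are formed with 128-bit quadword
	# adds: $two holds 2, and the vadduqm chain below yields
	# $ivec+1 ... $ivec+7 before each is xored with round key 0.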
	subi	$inp,$inp,15		# undo "caller"
	$SHL	$len,$len,4
	vadduqm	$out1,$ivec,$one	# counter values ...
	vadduqm	$out2,$ivec,$two
	vxor	$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	le?li	$idx,8
	vadduqm	$out3,$out1,$two
	vxor	$out1,$out1,$rndkey0
	le?lvsl	$inpperm,0,$idx
	vadduqm	$out4,$out2,$two
	vxor	$out2,$out2,$rndkey0
	le?vspltisb	$tmp,0x0f
	vadduqm	$out5,$out3,$two
	vxor	$out3,$out3,$rndkey0
	le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	vadduqm	$out6,$out4,$two
	vxor	$out4,$out4,$rndkey0
	vadduqm	$out7,$out5,$two
	vxor	$out5,$out5,$rndkey0
	vadduqm	$ivec,$out6,$two	# next counter value
	vxor	$out6,$out6,$rndkey0
	vxor	$out7,$out7,$rndkey0
	mtctr	$rounds
	b	Loop_ctr32_enc8x
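
# After the first batch the code re-enters at Loop_ctr32_enc8x_middle:
# the round[1] vciphers for the next batch are interleaved with the
# stores of the previous one (at the bottom of the big loop), so only
# rounds 2 and up remain to be applied from _middle onward.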
.align	5
Loop_ctr32_enc8x:
	vcipher	$out0,$out0,v24
	vcipher	$out1,$out1,v24
	vcipher	$out2,$out2,v24
	vcipher	$out3,$out3,v24
	vcipher	$out4,$out4,v24
	vcipher	$out5,$out5,v24
	vcipher	$out6,$out6,v24
	vcipher	$out7,$out7,v24
Loop_ctr32_enc8x_middle:
	lvx	v24,$x20,$key_		# round[3]
	addi	$key_,$key_,0x20

	vcipher	$out0,$out0,v25
	vcipher	$out1,$out1,v25
	vcipher	$out2,$out2,v25
	vcipher	$out3,$out3,v25
	vcipher	$out4,$out4,v25
	vcipher	$out5,$out5,v25
	vcipher	$out6,$out6,v25
	vcipher	$out7,$out7,v25
	lvx	v25,$x10,$key_		# round[4]
	bdnz	Loop_ctr32_enc8x

	subic	r11,$len,256		# $len-256, borrow $key_
	vcipher	$out0,$out0,v24
	vcipher	$out1,$out1,v24
	vcipher	$out2,$out2,v24
	vcipher	$out3,$out3,v24
	vcipher	$out4,$out4,v24
	vcipher	$out5,$out5,v24
	vcipher	$out6,$out6,v24
	vcipher	$out7,$out7,v24

	subfe	r0,r0,r0		# borrow?-1:0
	vcipher	$out0,$out0,v25
	vcipher	$out1,$out1,v25
	vcipher	$out2,$out2,v25
	vcipher	$out3,$out3,v25
	vcipher	$out4,$out4,v25
	vcipher	$out5,$out5,v25
	vcipher	$out6,$out6,v25
	vcipher	$out7,$out7,v25

	and	r0,r0,r11
	addi	$key_,$sp,$FRAME+15	# rewind $key_
	vcipher	$out0,$out0,v26
	vcipher	$out1,$out1,v26
	vcipher	$out2,$out2,v26
	vcipher	$out3,$out3,v26
	vcipher	$out4,$out4,v26
	vcipher	$out5,$out5,v26
	vcipher	$out6,$out6,v26
	vcipher	$out7,$out7,v26
	lvx	v24,$x00,$key_		# re-pre-load round[1]

	subic	$len,$len,129		# $len-=129
	vcipher	$out0,$out0,v27
	addi	$len,$len,1		# $len-=128 really
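	# The subic/addi pair nets $len-=128 while borrowing one byte
	# early, so CA clears exactly when 128 bytes or fewer remain;
	# the subfe./bne further below act on that to leave the loop.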
  1374. vcipher $out1,$out1,v27
  1375. vcipher $out2,$out2,v27
  1376. vcipher $out3,$out3,v27
  1377. vcipher $out4,$out4,v27
  1378. vcipher $out5,$out5,v27
  1379. vcipher $out6,$out6,v27
  1380. vcipher $out7,$out7,v27
  1381. lvx v25,$x10,$key_ # re-pre-load round[2]
  1382. vcipher $out0,$out0,v28
  1383. lvx_u $in0,$x00,$inp # load input
  1384. vcipher $out1,$out1,v28
  1385. lvx_u $in1,$x10,$inp
  1386. vcipher $out2,$out2,v28
  1387. lvx_u $in2,$x20,$inp
  1388. vcipher $out3,$out3,v28
  1389. lvx_u $in3,$x30,$inp
  1390. vcipher $out4,$out4,v28
  1391. lvx_u $in4,$x40,$inp
  1392. vcipher $out5,$out5,v28
  1393. lvx_u $in5,$x50,$inp
  1394. vcipher $out6,$out6,v28
  1395. lvx_u $in6,$x60,$inp
  1396. vcipher $out7,$out7,v28
  1397. lvx_u $in7,$x70,$inp
  1398. addi $inp,$inp,0x80
  1399. vcipher $out0,$out0,v29
  1400. le?vperm $in0,$in0,$in0,$inpperm
  1401. vcipher $out1,$out1,v29
  1402. le?vperm $in1,$in1,$in1,$inpperm
  1403. vcipher $out2,$out2,v29
  1404. le?vperm $in2,$in2,$in2,$inpperm
  1405. vcipher $out3,$out3,v29
  1406. le?vperm $in3,$in3,$in3,$inpperm
  1407. vcipher $out4,$out4,v29
  1408. le?vperm $in4,$in4,$in4,$inpperm
  1409. vcipher $out5,$out5,v29
  1410. le?vperm $in5,$in5,$in5,$inpperm
  1411. vcipher $out6,$out6,v29
  1412. le?vperm $in6,$in6,$in6,$inpperm
  1413. vcipher $out7,$out7,v29
  1414. le?vperm $in7,$in7,$in7,$inpperm
  1415. add $inp,$inp,r0 # $inp is adjusted in such
  1416. # way that at exit from the
  1417. # loop inX-in7 are loaded
  1418. # with last "words"
  1419. subfe. r0,r0,r0 # borrow?-1:0
  1420. vcipher $out0,$out0,v30
  1421. vxor $in0,$in0,v31 # xor with last round key
  1422. vcipher $out1,$out1,v30
  1423. vxor $in1,$in1,v31
  1424. vcipher $out2,$out2,v30
  1425. vxor $in2,$in2,v31
  1426. vcipher $out3,$out3,v30
  1427. vxor $in3,$in3,v31
  1428. vcipher $out4,$out4,v30
  1429. vxor $in4,$in4,v31
  1430. vcipher $out5,$out5,v30
  1431. vxor $in5,$in5,v31
  1432. vcipher $out6,$out6,v30
  1433. vxor $in6,$in6,v31
  1434. vcipher $out7,$out7,v30
  1435. vxor $in7,$in7,v31
  1436. bne Lctr32_enc8x_break # did $len-129 borrow?
  1437. vcipherlast $in0,$out0,$in0
  1438. vcipherlast $in1,$out1,$in1
  1439. vadduqm $out1,$ivec,$one # counter values ...
  1440. vcipherlast $in2,$out2,$in2
  1441. vadduqm $out2,$ivec,$two
  1442. vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
  1443. vcipherlast $in3,$out3,$in3
  1444. vadduqm $out3,$out1,$two
  1445. vxor $out1,$out1,$rndkey0
  1446. vcipherlast $in4,$out4,$in4
  1447. vadduqm $out4,$out2,$two
  1448. vxor $out2,$out2,$rndkey0
  1449. vcipherlast $in5,$out5,$in5
  1450. vadduqm $out5,$out3,$two
  1451. vxor $out3,$out3,$rndkey0
  1452. vcipherlast $in6,$out6,$in6
  1453. vadduqm $out6,$out4,$two
  1454. vxor $out4,$out4,$rndkey0
  1455. vcipherlast $in7,$out7,$in7
  1456. vadduqm $out7,$out5,$two
  1457. vxor $out5,$out5,$rndkey0
  1458. le?vperm $in0,$in0,$in0,$inpperm
  1459. vadduqm $ivec,$out6,$two # next counter value
  1460. vxor $out6,$out6,$rndkey0
  1461. le?vperm $in1,$in1,$in1,$inpperm
  1462. vxor $out7,$out7,$rndkey0
  1463. mtctr $rounds
  1464. vcipher $out0,$out0,v24
  1465. stvx_u $in0,$x00,$out
  1466. le?vperm $in2,$in2,$in2,$inpperm
  1467. vcipher $out1,$out1,v24
  1468. stvx_u $in1,$x10,$out
  1469. le?vperm $in3,$in3,$in3,$inpperm
  1470. vcipher $out2,$out2,v24
  1471. stvx_u $in2,$x20,$out
  1472. le?vperm $in4,$in4,$in4,$inpperm
  1473. vcipher $out3,$out3,v24
  1474. stvx_u $in3,$x30,$out
  1475. le?vperm $in5,$in5,$in5,$inpperm
  1476. vcipher $out4,$out4,v24
  1477. stvx_u $in4,$x40,$out
  1478. le?vperm $in6,$in6,$in6,$inpperm
  1479. vcipher $out5,$out5,v24
  1480. stvx_u $in5,$x50,$out
  1481. le?vperm $in7,$in7,$in7,$inpperm
  1482. vcipher $out6,$out6,v24
  1483. stvx_u $in6,$x60,$out
  1484. vcipher $out7,$out7,v24
  1485. stvx_u $in7,$x70,$out
  1486. addi $out,$out,0x80
  1487. b Loop_ctr32_enc8x_middle
  1488. .align 5
  1489. Lctr32_enc8x_break:
  1490. cmpwi $len,-0x60
  1491. blt Lctr32_enc8x_one
  1492. nop
  1493. beq Lctr32_enc8x_two
  1494. cmpwi $len,-0x40
  1495. blt Lctr32_enc8x_three
  1496. nop
  1497. beq Lctr32_enc8x_four
  1498. cmpwi $len,-0x20
  1499. blt Lctr32_enc8x_five
  1500. nop
  1501. beq Lctr32_enc8x_six
  1502. cmpwi $len,0x00
  1503. blt Lctr32_enc8x_seven
Lctr32_enc8x_eight:
	vcipherlast	$out0,$out0,$in0
	vcipherlast	$out1,$out1,$in1
	vcipherlast	$out2,$out2,$in2
	vcipherlast	$out3,$out3,$in3
	vcipherlast	$out4,$out4,$in4
	vcipherlast	$out5,$out5,$in5
	vcipherlast	$out6,$out6,$in6
	vcipherlast	$out7,$out7,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_seven:
	vcipherlast	$out0,$out0,$in1
	vcipherlast	$out1,$out1,$in2
	vcipherlast	$out2,$out2,$in3
	vcipherlast	$out3,$out3,$in4
	vcipherlast	$out4,$out4,$in5
	vcipherlast	$out5,$out5,$in6
	vcipherlast	$out6,$out6,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	stvx_u		$out6,$x60,$out
	addi		$out,$out,0x70
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_six:
	vcipherlast	$out0,$out0,$in2
	vcipherlast	$out1,$out1,$in3
	vcipherlast	$out2,$out2,$in4
	vcipherlast	$out3,$out3,$in5
	vcipherlast	$out4,$out4,$in6
	vcipherlast	$out5,$out5,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	stvx_u		$out5,$x50,$out
	addi		$out,$out,0x60
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_five:
	vcipherlast	$out0,$out0,$in3
	vcipherlast	$out1,$out1,$in4
	vcipherlast	$out2,$out2,$in5
	vcipherlast	$out3,$out3,$in6
	vcipherlast	$out4,$out4,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_four:
	vcipherlast	$out0,$out0,$in4
	vcipherlast	$out1,$out1,$in5
	vcipherlast	$out2,$out2,$in6
	vcipherlast	$out3,$out3,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_three:
	vcipherlast	$out0,$out0,$in5
	vcipherlast	$out1,$out1,$in6
	vcipherlast	$out2,$out2,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_two:
	vcipherlast	$out0,$out0,$in6
	vcipherlast	$out1,$out1,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	b		Lctr32_enc8x_done
.align	5
Lctr32_enc8x_one:
	vcipherlast	$out0,$out0,$in7
	le?vperm	$out0,$out0,$out0,$inpperm
	stvx_u		$out0,0,$out
	addi		$out,$out,0x10
Lctr32_enc8x_done:
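	# Overwrite the stack copies of the round keys with $inpperm (a
	# non-secret permutation mask) so no key material is left behind.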
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
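
	# Restore VRSAVE (SPR 256) and the non-volatile vector registers
	# v20-v31 saved in the prologue, as the ABI requires.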
	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
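	# traceback table (OpenSSL's usual PowerPC frame metadata)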
	.long		0
	.byte		0,12,0x14,0,0x80,6,6,0
	.long		0
.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}}	}}}
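
# Post-process the generated $code line by line: evaluate backquoted
# arithmetic, emit the constants table in the target endianness, and
# rewrite the '?'-tagged endian-specific instructions.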
my $consts=1;
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$3;
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($1 eq "long") {
		foreach (split(/,\s*/,$2)) {
		    my $l = /^0/?oct:int;
		    push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
		}
	    } else {
		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv) {
		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do { @bytes=reverse(@bytes); last; };
		}
	    }
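
	    # e.g. a "?rev"-tagged table is emitted byte-reversed on LE,
	    # while "?inv" XORs each byte with 0xf (e.g. to flip vperm
	    # lane indices).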
	    # emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
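
	    # e.g. "le?..." is emitted as-is here while "be?..." is
	    # commented out; "?lvsr"/"?lvsl" swap roles, "?vperm" has its
	    # two source registers exchanged, "?vsldoi ...,N" becomes a
	    # shift by 16-N bytes, and "?vspltw ...,N" selects word 3-N.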
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}

	print $_,"\n";
}

close STDOUT;