- /*
- * Implement AES algorithm in Intel AES-NI instructions.
- *
- * The white paper of AES-NI instructions can be downloaded from:
- * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
- *
- * Copyright (C) 2008, Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- * Vinodh Gopal <vinodh.gopal@intel.com>
- * Kahraman Akdemir
- *
- * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
- * interface for 64-bit kernels.
- * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
- * Aidan O'Mahony (aidan.o.mahony@intel.com)
- * Adrian Hoban <adrian.hoban@intel.com>
- * James Guilford (james.guilford@intel.com)
- * Gabriele Paoloni <gabriele.paoloni@intel.com>
- * Tadeusz Struk (tadeusz.struk@intel.com)
- * Wajdi Feghali (wajdi.k.feghali@intel.com)
- * Copyright (c) 2010, Intel Corporation.
- *
- * Ported x86_64 version to x86:
- * Author: Mathias Krause <minipli@googlemail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
- #include <linux/linkage.h>
- #include <asm/inst.h>
- #include <asm/nospec-branch.h>
- /*
- * The following macros are used to move an (un)aligned 16 byte value to/from
- * an XMM register. This can be done for either FP or integer values; for FP
- * use movaps (move aligned packed single), for integer use movdqa (move
- * double quad aligned). Which instruction is used makes no performance
- * difference since Nehalem (the original Core i7). However, movaps is a byte
- * shorter, so that is the one we'll use for now (same for the unaligned
- * variants).
- */
- #define MOVADQ movaps
- #define MOVUDQ movups
- #ifdef __x86_64__
- .data
- .align 16
- .Lgf128mul_x_ble_mask:
- .octa 0x00000000000000010000000000000087
- POLY: .octa 0xC2000000000000000000000000000001
- TWOONE: .octa 0x00000001000000000000000000000001
- # order of these constants should not change.
- # more specifically, ALL_F should follow SHIFT_MASK,
- # and ZERO should follow ALL_F
- SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
- MASK1: .octa 0x0000000000000000ffffffffffffffff
- MASK2: .octa 0xffffffffffffffff0000000000000000
- SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
- ALL_F: .octa 0xffffffffffffffffffffffffffffffff
- ZERO: .octa 0x00000000000000000000000000000000
- ONE: .octa 0x00000000000000000000000000000001
- F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
- dec: .octa 0x1
- enc: .octa 0x2
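- /*
- * SHUF_MASK, applied via PSHUFB, reverses all 16 bytes of an XMM
- * register, converting between the big-endian byte order GHASH is
- * defined over and the little-endian lane order of XMM registers.
- * As a C sketch (illustrative only, not part of the build):
- *
- *   for (i = 0; i < 16; i++)
- *           out[i] = in[15 - i];
- */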
- .text
- #define STACK_OFFSET 8*3
- #define HashKey 16*0 // store HashKey <<1 mod poly here
- #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
- #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
- #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
- #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
- // bits of HashKey <<1 mod poly here
- //(for Karatsuba purposes)
- #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
- // bits of HashKey^2 <<1 mod poly here
- // (for Karatsuba purposes)
- #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
- // bits of HashKey^3 <<1 mod poly here
- // (for Karatsuba purposes)
- #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
- // bits of HashKey^4 <<1 mod poly here
- // (for Karatsuba purposes)
- #define VARIABLE_OFFSET 16*8
- #define arg1 rdi
- #define arg2 rsi
- #define arg3 rdx
- #define arg4 rcx
- #define arg5 r8
- #define arg6 r9
- #define arg7 STACK_OFFSET+8(%r14)
- #define arg8 STACK_OFFSET+16(%r14)
- #define arg9 STACK_OFFSET+24(%r14)
- #define arg10 STACK_OFFSET+32(%r14)
- #define keysize 2*15*16(%arg1)
- #endif
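- /*
- * Conceptual view of the VARIABLE_OFFSET-byte scratch area carved out
- * below %rsp at the offsets above. The struct is an illustrative sketch
- * only (field names mirror the #defines; nothing in the code uses it):
- *
- *   struct gcm_precompute {
- *           u8 hashkey[16];       // HashKey<<1 mod poly
- *           u8 hashkey_2[16];     // HashKey^2<<1 mod poly
- *           u8 hashkey_3[16];     // HashKey^3<<1 mod poly
- *           u8 hashkey_4[16];     // HashKey^4<<1 mod poly
- *           u8 hashkey_k[16];     // high64 ^ low64 of HashKey (Karatsuba)
- *           u8 hashkey_2_k[16];   // same for HashKey^2
- *           u8 hashkey_3_k[16];   // same for HashKey^3
- *           u8 hashkey_4_k[16];   // same for HashKey^4
- *   };                            // sizeof == VARIABLE_OFFSET (16*8)
- */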
- #define STATE1 %xmm0
- #define STATE2 %xmm4
- #define STATE3 %xmm5
- #define STATE4 %xmm6
- #define STATE STATE1
- #define IN1 %xmm1
- #define IN2 %xmm7
- #define IN3 %xmm8
- #define IN4 %xmm9
- #define IN IN1
- #define KEY %xmm2
- #define IV %xmm3
- #define BSWAP_MASK %xmm10
- #define CTR %xmm11
- #define INC %xmm12
- #define GF128MUL_MASK %xmm10
- #ifdef __x86_64__
- #define AREG %rax
- #define KEYP %rdi
- #define OUTP %rsi
- #define UKEYP OUTP
- #define INP %rdx
- #define LEN %rcx
- #define IVP %r8
- #define KLEN %r9d
- #define T1 %r10
- #define TKEYP T1
- #define T2 %r11
- #define TCTR_LOW T2
- #else
- #define AREG %eax
- #define KEYP %edi
- #define OUTP AREG
- #define UKEYP OUTP
- #define INP %edx
- #define LEN %esi
- #define IVP %ebp
- #define KLEN %ebx
- #define T1 %ecx
- #define TKEYP T1
- #endif
- #ifdef __x86_64__
- /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
- *
- * Input: A and B (128-bits each, bit-reflected)
- * Output: C = A*B*x mod poly (i.e. >>1)
- * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input;
- * then GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
- */
- .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
- movdqa \GH, \TMP1
- pshufd $78, \GH, \TMP2
- pshufd $78, \HK, \TMP3
- pxor \GH, \TMP2 # TMP2 = a1+a0
- pxor \HK, \TMP3 # TMP3 = b1+b0
- PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
- PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
- pxor \GH, \TMP2
- pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle term)
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \GH
- pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
- # first phase of the reduction
- movdqa \GH, \TMP2
- movdqa \GH, \TMP3
- movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
- # in order to perform
- # independent shifts
- pslld $31, \TMP2 # packed left shift <<31
- pslld $30, \TMP3 # packed left shift <<30
- pslld $25, \TMP4 # packed left shift <<25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift TMP5 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \GH
- # second phase of the reduction
- movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
- # in order to perform
- # independent shifts
- movdqa \GH,\TMP3
- movdqa \GH,\TMP4
- psrld $1,\TMP2 # packed right shift >>1
- psrld $2,\TMP3 # packed right shift >>2
- psrld $7,\TMP4 # packed right shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \GH
- pxor \TMP1, \GH # result is in GH
- .endm
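- /*
- * Reference sketch of the Karatsuba trick GHASH_MUL relies on: a
- * 128x128 carry-less multiply needs only three 64x64 PCLMULQDQs,
- * because a1*b0 ^ a0*b1 = (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0. The C
- * below is illustrative only (clmul64 is a hypothetical bitwise
- * stand-in for PCLMULQDQ) and omits the polynomial reduction:
- *
- *   static void clmul64(u64 a, u64 b, u64 *hi, u64 *lo)
- *   {
- *           u64 h = 0, l = 0;
- *           int i;
- *
- *           for (i = 0; i < 64; i++)
- *                   if ((b >> i) & 1) {
- *                           l ^= a << i;
- *                           h ^= i ? a >> (64 - i) : 0;
- *                   }
- *           *hi = h; *lo = l;
- *   }
- *
- *   static void clmul128(const u64 a[2], const u64 b[2], u64 r[4])
- *   {
- *           u64 hh[2], ll[2], mm[2];
- *
- *           clmul64(a[1], b[1], &hh[1], &hh[0]);   // a1*b1
- *           clmul64(a[0], b[0], &ll[1], &ll[0]);   // a0*b0
- *           clmul64(a[1] ^ a[0], b[1] ^ b[0], &mm[1], &mm[0]);
- *           mm[0] ^= hh[0] ^ ll[0];                // middle term
- *           mm[1] ^= hh[1] ^ ll[1];
- *           r[0] = ll[0];
- *           r[1] = ll[1] ^ mm[0];
- *           r[2] = hh[0] ^ mm[1];
- *           r[3] = hh[1];
- *   }
- */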
- /*
- * if a = number of total plaintext bytes
- * b = floor(a/16)
- * num_initial_blocks = b mod 4
- * encrypt the initial num_initial_blocks blocks and apply ghash on
- * the ciphertext
- * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
- * are clobbered
- * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
- */
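- /*
- * Worked example (sketch): a = 53 plaintext bytes gives b = 53/16 = 3
- * full blocks, so num_initial_blocks = 3 mod 4 = 3; the remaining full
- * blocks are then processed four at a time and the 5 leftover bytes go
- * through the partial-block path. In C:
- *
- *   u64 b = a / 16, num_initial_blocks = b % 4;
- */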
- .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
- XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
- MOVADQ SHUF_MASK(%rip), %xmm14
- mov arg7, %r10 # %r10 = AAD
- mov arg8, %r12 # %r12 = aadLen
- mov %r12, %r11
- pxor %xmm\i, %xmm\i
- _get_AAD_loop\num_initial_blocks\operation:
- movd (%r10), \TMP1
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
- add $4, %r10
- sub $4, %r12
- jne _get_AAD_loop\num_initial_blocks\operation
- cmp $16, %r11
- je _get_AAD_loop2_done\num_initial_blocks\operation
- mov $16, %r12
- _get_AAD_loop2\num_initial_blocks\operation:
- psrldq $4, %xmm\i
- sub $4, %r12
- cmp %r11, %r12
- jne _get_AAD_loop2\num_initial_blocks\operation
- _get_AAD_loop2_done\num_initial_blocks\operation:
- PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
- xor %r11, %r11 # initialise the data pointer offset as zero
- # start AES for num_initial_blocks blocks
- mov %arg5, %rax # %rax = *Y0
- movdqu (%rax), \XMM0 # XMM0 = Y0
- PSHUFB_XMM %xmm14, \XMM0
- .if (\i == 5) || (\i == 6) || (\i == 7)
- MOVADQ ONE(%RIP),\TMP1
- MOVADQ (%arg1),\TMP2
- .irpc index, \i_seq
- paddd \TMP1, \XMM0 # INCR Y0
- movdqa \XMM0, %xmm\index
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
- pxor \TMP2, %xmm\index
- .endr
- lea 0x10(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
- aes_loop_initial_dec\num_initial_blocks:
- MOVADQ (%r10),\TMP1
- .irpc index, \i_seq
- AESENC \TMP1, %xmm\index
- .endr
- add $16,%r10
- sub $1,%eax
- jnz aes_loop_initial_dec\num_initial_blocks
- MOVADQ (%r10), \TMP1
- .irpc index, \i_seq
- AESENCLAST \TMP1, %xmm\index # Last Round
- .endr
- .irpc index, \i_seq
- movdqu (%arg3 , %r11, 1), \TMP1
- pxor \TMP1, %xmm\index
- movdqu %xmm\index, (%arg2 , %r11, 1)
- # write back plaintext/ciphertext for num_initial_blocks
- add $16, %r11
- movdqa \TMP1, %xmm\index
- PSHUFB_XMM %xmm14, %xmm\index
- # prepare plaintext/ciphertext for GHASH computation
- .endr
- .endif
- GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- # apply GHASH on num_initial_blocks blocks
- .if \i == 5
- pxor %xmm5, %xmm6
- GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- .elseif \i == 6
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- .elseif \i == 7
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- .endif
- cmp $64, %r13
- jl _initial_blocks_done\num_initial_blocks\operation
- # no need for precomputed values
- /*
- * Precomputations for HashKey parallel with encryption of first 4 blocks.
- * HashKey_i_k holds the XORed values of the high and low parts of HashKey^i
- */
- MOVADQ ONE(%rip), \TMP1
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM1
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM2
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM3
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM4
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
- MOVADQ 0(%arg1),\TMP1
- pxor \TMP1, \XMM1
- pxor \TMP1, \XMM2
- pxor \TMP1, \XMM3
- pxor \TMP1, \XMM4
- movdqa \TMP3, \TMP5
- pshufd $78, \TMP3, \TMP1
- pxor \TMP3, \TMP1
- movdqa \TMP1, HashKey_k(%rsp)
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
- # TMP5 = HashKey^2<<1 (mod poly)
- movdqa \TMP5, HashKey_2(%rsp)
- # HashKey_2 = HashKey^2<<1 (mod poly)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_2_k(%rsp)
- .irpc index, 1234 # do 4 rounds
- movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- .endr
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
- # TMP5 = HashKey^3<<1 (mod poly)
- movdqa \TMP5, HashKey_3(%rsp)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_3_k(%rsp)
- .irpc index, 56789 # do next 5 rounds
- movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- .endr
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
- # TMP5 = HashKey^4<<1 (mod poly)
- movdqa \TMP5, HashKey_4(%rsp)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_4_k(%rsp)
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_pre_dec_done\num_initial_blocks
- aes_loop_pre_dec\num_initial_blocks:
- MOVADQ (%r10),\TMP2
- .irpc index, 1234
- AESENC \TMP2, %xmm\index
- .endr
- add $16,%r10
- sub $1,%eax
- jnz aes_loop_pre_dec\num_initial_blocks
- aes_loop_pre_dec_done\num_initial_blocks:
- MOVADQ (%r10), \TMP2
- AESENCLAST \TMP2, \XMM1
- AESENCLAST \TMP2, \XMM2
- AESENCLAST \TMP2, \XMM3
- AESENCLAST \TMP2, \XMM4
- movdqu 16*0(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM1
- movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
- movdqa \TMP1, \XMM1
- movdqu 16*1(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM2
- movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
- movdqa \TMP1, \XMM2
- movdqu 16*2(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM3
- movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
- movdqa \TMP1, \XMM3
- movdqu 16*3(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM4
- movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
- movdqa \TMP1, \XMM4
- add $64, %r11
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
- pxor \XMMDst, \XMM1
- # combine GHASHed value with the corresponding ciphertext
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
- _initial_blocks_done\num_initial_blocks\operation:
- .endm
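- /*
- * The AAD gather loop above builds the 16-byte GHASH block four bytes
- * at a time: the accumulator is shifted down one dword (psrldq $4)
- * while each freshly loaded dword lands in the top lane (pslldq $12 +
- * pxor); short AAD is then right-justified by further down-shifts and
- * the block is byte-reflected. A C sketch, assuming (as the loop does)
- * that aad_len is a multiple of 4:
- *
- *   u8 block[16] = { 0 };
- *   u64 off;
- *
- *   for (off = 0; off < aad_len; off += 4) {
- *           memmove(block, block + 4, 12);    // psrldq $4
- *           memcpy(block + 12, aad + off, 4); // pslldq $12 + pxor
- *   }
- *   // then (16 - aad_len)/4 more down-shifts, then the byte reflection
- */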
- /*
- * if a = number of total plaintext bytes
- * b = floor(a/16)
- * num_initial_blocks = b mod 4
- * encrypt the initial num_initial_blocks blocks and apply ghash on
- * the ciphertext
- * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
- * are clobbered
- * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
- */
- .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
- XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
- MOVADQ SHUF_MASK(%rip), %xmm14
- mov arg7, %r10 # %r10 = AAD
- mov arg8, %r12 # %r12 = aadLen
- mov %r12, %r11
- pxor %xmm\i, %xmm\i
- _get_AAD_loop\num_initial_blocks\operation:
- movd (%r10), \TMP1
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
- add $4, %r10
- sub $4, %r12
- jne _get_AAD_loop\num_initial_blocks\operation
- cmp $16, %r11
- je _get_AAD_loop2_done\num_initial_blocks\operation
- mov $16, %r12
- _get_AAD_loop2\num_initial_blocks\operation:
- psrldq $4, %xmm\i
- sub $4, %r12
- cmp %r11, %r12
- jne _get_AAD_loop2\num_initial_blocks\operation
- _get_AAD_loop2_done\num_initial_blocks\operation:
- PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
- xor %r11, %r11 # initialise the data pointer offset as zero
- # start AES for num_initial_blocks blocks
- mov %arg5, %rax # %rax = *Y0
- movdqu (%rax), \XMM0 # XMM0 = Y0
- PSHUFB_XMM %xmm14, \XMM0
- .if (\i == 5) || (\i == 6) || (\i == 7)
- MOVADQ ONE(%RIP),\TMP1
- MOVADQ 0(%arg1),\TMP2
- .irpc index, \i_seq
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, %xmm\index
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
- pxor \TMP2, %xmm\index
- .endr
- lea 0x10(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
- aes_loop_initial_enc\num_initial_blocks:
- MOVADQ (%r10),\TMP1
- .irpc index, \i_seq
- AESENC \TMP1, %xmm\index
- .endr
- add $16,%r10
- sub $1,%eax
- jnz aes_loop_initial_enc\num_initial_blocks
- MOVADQ (%r10), \TMP1
- .irpc index, \i_seq
- AESENCLAST \TMP1, %xmm\index # Last Round
- .endr
- .irpc index, \i_seq
- movdqu (%arg3 , %r11, 1), \TMP1
- pxor \TMP1, %xmm\index
- movdqu %xmm\index, (%arg2 , %r11, 1)
- # write back plaintext/ciphertext for num_initial_blocks
- add $16, %r11
- PSHUFB_XMM %xmm14, %xmm\index
- # prepare plaintext/ciphertext for GHASH computation
- .endr
- .endif
- GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- # apply GHASH on num_initial_blocks blocks
- .if \i == 5
- pxor %xmm5, %xmm6
- GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- .elseif \i == 6
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- .elseif \i == 7
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- .endif
- cmp $64, %r13
- jl _initial_blocks_done\num_initial_blocks\operation
- # no need for precomputed values
- /*
- * Precomputations for HashKey parallel with encryption of first 4 blocks.
- * HashKey_i_k holds the XORed values of the high and low parts of HashKey^i
- */
- MOVADQ ONE(%RIP),\TMP1
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM1
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM2
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM3
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM4
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
- MOVADQ 0(%arg1),\TMP1
- pxor \TMP1, \XMM1
- pxor \TMP1, \XMM2
- pxor \TMP1, \XMM3
- pxor \TMP1, \XMM4
- movdqa \TMP3, \TMP5
- pshufd $78, \TMP3, \TMP1
- pxor \TMP3, \TMP1
- movdqa \TMP1, HashKey_k(%rsp)
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
- # TMP5 = HashKey^2<<1 (mod poly)
- movdqa \TMP5, HashKey_2(%rsp)
- # HashKey_2 = HashKey^2<<1 (mod poly)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_2_k(%rsp)
- .irpc index, 1234 # do 4 rounds
- movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- .endr
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
- # TMP5 = HashKey^3<<1 (mod poly)
- movdqa \TMP5, HashKey_3(%rsp)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_3_k(%rsp)
- .irpc index, 56789 # do next 5 rounds
- movaps 0x10*\index(%arg1), \TMP1
- AESENC \TMP1, \XMM1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- .endr
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
- # TMP5 = HashKey^4<<1 (mod poly)
- movdqa \TMP5, HashKey_4(%rsp)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqa \TMP1, HashKey_4_k(%rsp)
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_pre_enc_done\num_initial_blocks
- aes_loop_pre_enc\num_initial_blocks:
- MOVADQ (%r10),\TMP2
- .irpc index, 1234
- AESENC \TMP2, %xmm\index
- .endr
- add $16,%r10
- sub $1,%eax
- jnz aes_loop_pre_enc\num_initial_blocks
- aes_loop_pre_enc_done\num_initial_blocks:
- MOVADQ (%r10), \TMP2
- AESENCLAST \TMP2, \XMM1
- AESENCLAST \TMP2, \XMM2
- AESENCLAST \TMP2, \XMM3
- AESENCLAST \TMP2, \XMM4
- movdqu 16*0(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM1
- movdqu 16*1(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM2
- movdqu 16*2(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM3
- movdqu 16*3(%arg3 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM4
- movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
- movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
- movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
- movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
- add $64, %r11
- PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
- pxor \XMMDst, \XMM1
- # combine GHASHed value with the corresponding ciphertext
- PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
- _initial_blocks_done\num_initial_blocks\operation:
- .endm
- /*
- * encrypt 4 blocks at a time
- * ghash the 4 previously encrypted ciphertext blocks
- * arg1, %arg2, %arg3 are used as pointers only, not modified
- * %r11 is the data offset value
- */
- .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
- TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply XMM5 * HashKey_4 using Karatsuba
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa HashKey_4(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqa HashKey_4_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 2
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqa HashKey_3(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 3
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 4
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- movdqa HashKey_3_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 5
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- pxor \TMP1, \TMP4
- # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqa HashKey_2(%rsp ), \TMP5
- # Multiply XMM7 * HashKey_2 using Karatsuba
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 6
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 7
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- movdqa HashKey_2_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 8
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- pxor \TMP1, \TMP4
- # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqa HashKey(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 9
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_par_enc_done
- aes_loop_par_enc:
- MOVADQ (%r10),\TMP3
- .irpc index, 1234
- AESENC \TMP3, %xmm\index
- .endr
- add $16,%r10
- sub $1,%eax
- jnz aes_loop_par_enc
- aes_loop_par_enc_done:
- MOVADQ (%r10), \TMP3
- AESENCLAST \TMP3, \XMM1 # last round
- AESENCLAST \TMP3, \XMM2
- AESENCLAST \TMP3, \XMM3
- AESENCLAST \TMP3, \XMM4
- movdqa HashKey_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu 16(%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu 32(%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu 48(%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
- # first phase of reduction
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
- # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed left shift << 31
- pslld $30, \TMP3 # packed left shift << 30
- pslld $25, \TMP4 # packed left shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift TMP5 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \XMM5
- # second phase of reduction
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed right shift >>1
- psrld $2, \TMP3 # packed right shift >>2
- psrld $7, \TMP4 # packed right shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in XMM5
- pxor \XMM5, \XMM1
- .endm
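- /*
- * Note on the counter arithmetic above: XMM0 holds the counter block
- * byte-reflected, so the 32-bit big-endian GCM counter sits in the low
- * dword and "paddd ONE(%rip)" increments it with a plain add; the
- * PSHUFB afterwards restores big-endian byte order before AESENC.
- * Equivalent C sketch (illustrative only, little-endian host assumed):
- *
- *   static void ctr_inc_reflected(u8 ctr[16])
- *   {
- *           u32 low;
- *
- *           memcpy(&low, ctr, 4);  // reflected counter = low LE dword
- *           low += 1;              // paddd ONE(%rip)
- *           memcpy(ctr, &low, 4);
- *   }
- */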
- /*
- * decrypt 4 blocks at a time
- * ghash the 4 previously decrypted ciphertext blocks
- * arg1, %arg2, %arg3 are used as pointers only, not modified
- * %r11 is the data offset value
- */
- .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
- TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply XMM5 * HashKey_4 using Karatsuba
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa HashKey_4(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqa HashKey_4_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 1
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, \XMM1 # Round 2
- AESENC \TMP1, \XMM2
- AESENC \TMP1, \XMM3
- AESENC \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqa HashKey_3(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 3
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 4
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- movdqa HashKey_3_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 5
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- pxor \TMP1, \TMP4
- # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqa HashKey_2(%rsp ), \TMP5
- # Multiply XMM7 * HashKey_2 using Karatsuba
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 6
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 7
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- movdqa HashKey_2_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 8
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- pxor \TMP1, \TMP4
- # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqa HashKey(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- AESENC \TMP3, \XMM1 # Round 9
- AESENC \TMP3, \XMM2
- AESENC \TMP3, \XMM3
- AESENC \TMP3, \XMM4
- PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_par_dec_done
- aes_loop_par_dec:
- MOVADQ (%r10),\TMP3
- .irpc index, 1234
- AESENC \TMP3, %xmm\index
- .endr
- add $16,%r10
- sub $1,%eax
- jnz aes_loop_par_dec
- aes_loop_par_dec_done:
- MOVADQ (%r10), \TMP3
- AESENCLAST \TMP3, \XMM1 # last round
- AESENCLAST \TMP3, \XMM2
- AESENCLAST \TMP3, \XMM3
- AESENCLAST \TMP3, \XMM4
- movdqa HashKey_k(%rsp), \TMP5
- PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM1
- movdqu 16(%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM2
- movdqu 32(%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM3
- movdqu 48(%arg3,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM4
- PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
- PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
- # first phase of reduction
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
- # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed left shift << 31
- pslld $30, \TMP3 # packed left shift << 30
- pslld $25, \TMP4 # packed left shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift TMP5 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \XMM5
- # second phase of reduction
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed right shift >>1
- psrld $2, \TMP3 # packed right shift >>2
- psrld $7, \TMP4 # packed right shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in XMM5
- pxor \XMM5, \XMM1
- .endm
- /* GHASH the last 4 ciphertext blocks. */
- .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
- TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
- # Multiply XMM1 * HashKey_4 (using Karatsuba)
- movdqa \XMM1, \TMP6
- pshufd $78, \XMM1, \TMP2
- pxor \XMM1, \TMP2
- movdqa HashKey_4(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
- movdqa HashKey_4_k(%rsp), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqa \XMM1, \XMMDst
- movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
- # Multiply XMM2 * HashKey_3 (using Karatsuba)
- movdqa \XMM2, \TMP1
- pshufd $78, \XMM2, \TMP2
- pxor \XMM2, \TMP2
- movdqa HashKey_3(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
- movdqa HashKey_3_k(%rsp), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM2, \XMMDst
- pxor \TMP2, \XMM1
- # results accumulated in TMP6, XMMDst, XMM1
- # Multiply XMM3 * HashKey_2 (using Karatsuba)
- movdqa \XMM3, \TMP1
- pshufd $78, \XMM3, \TMP2
- pxor \XMM3, \TMP2
- movdqa HashKey_2(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
- movdqa HashKey_2_k(%rsp), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM3, \XMMDst
- pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
- # Multiply XMM4 * HashKey (using Karatsuba)
- movdqa \XMM4, \TMP1
- pshufd $78, \XMM4, \TMP2
- pxor \XMM4, \TMP2
- movdqa HashKey(%rsp), \TMP5
- PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
- movdqa HashKey_k(%rsp), \TMP4
- PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM4, \XMMDst
- pxor \XMM1, \TMP2
- pxor \TMP6, \TMP2
- pxor \XMMDst, \TMP2
- # middle section of the temp results combined as in the Karatsuba algorithm
- movdqa \TMP2, \TMP4
- pslldq $8, \TMP4 # left shift TMP4 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP4, \XMMDst
- pxor \TMP2, \TMP6
- # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
- # first phase of the reduction
- movdqa \XMMDst, \TMP2
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
- # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
- pslld $31, \TMP2 # packed left shifting << 31
- pslld $30, \TMP3 # packed left shifting << 30
- pslld $25, \TMP4 # packed left shifting << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP7
- psrldq $4, \TMP7 # right shift TMP7 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \XMMDst
- # second phase of the reduction
- movdqa \XMMDst, \TMP2
- # make 3 copies of XMMDst for doing 3 shift operations
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
- psrld $1, \TMP2 # packed right shift >> 1
- psrld $2, \TMP3 # packed right shift >> 2
- psrld $7, \TMP4 # packed right shift >> 7
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- pxor \TMP7, \TMP2
- pxor \TMP2, \XMMDst
- pxor \TMP6, \XMMDst # reduced result is in XMMDst
- .endm
- /* Encryption of a single block
- * uses eax & r10
- */
- .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
- pxor (%arg1), \XMM0
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
- lea 16(%arg1), %r10 # get first expanded key address
- _esb_loop_\@:
- MOVADQ (%r10),\TMP1
- AESENC \TMP1,\XMM0
- add $16,%r10
- sub $1,%eax
- jnz _esb_loop_\@
- MOVADQ (%r10),\TMP1
- AESENCLAST \TMP1,\XMM0
- .endm
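- /*
- * The "shr $2; add $5" idiom above derives the AESENC round count from
- * the key length in bytes (16, 24 or 32, an assumption about what the
- * keysize field holds). As a C sketch:
- *
- *   static inline int aesenc_rounds(int key_bytes)
- *   {
- *           return key_bytes / 4 + 5; // 16->9, 24->11, 32->13
- *   }                                 // plus one final AESENCLAST
- */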
- /*****************************************************************************
- * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
- * u8 *out, // Plaintext output. Encrypt in-place is allowed.
- * const u8 *in, // Ciphertext input
- * u64 plaintext_len, // Length of data in bytes for decryption.
- * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
- * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
- * // concatenated with 0x00000001. 16-byte aligned pointer.
- * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
- * const u8 *aad, // Additional Authentication Data (AAD)
- * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
- * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
- * // given authentication tag and only return the plaintext if they match.
- * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
- * // (most likely), 12 or 8.
- *
- * Assumptions:
- *
- * keys:
- * keys are pre-expanded and aligned to 16 bytes. we are using the first
- * set of 11 keys in the data structure void *aes_ctx
- *
- * iv:
- * 0 1 2 3
- * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | Salt (From the SA) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | Initialization Vector |
- * | (This is the sequence number from IPSec header) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 0x1 |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- *
- *
- * AAD:
- * AAD padded to 128 bits with 0
- * for example, assume AAD is a u32 vector
- *
- * if AAD is 8 bytes:
- * AAD[3] = {A0, A1};
- * padded AAD in xmm register = {A1 A0 0 0}
- *
- * 0 1 2 3
- * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | SPI (A1) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 32-bit Sequence Number (A0) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 0x0 |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- * AAD Format with 32-bit Sequence Number
- *
- * if AAD is 12 bytes:
- * AAD[3] = {A0, A1, A2};
- * padded AAD in xmm register = {A2 A1 A0 0}
- *
- * 0 1 2 3
- * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | SPI (A2) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 64-bit Extended Sequence Number {A1,A0} |
- * | |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 0x0 |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- * AAD Format with 64-bit Extended Sequence Number
- *
- * aadLen:
- * from the definition of the spec, aadLen can only be 8 or 12 bytes.
- * The code also supports 16, but it will fail for other sizes.
- *
- * TLen:
- * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
- * For other sizes, the code will fail.
- *
- * poly = x^128 + x^127 + x^126 + x^121 + 1
- *
- *****************************************************************************/
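- /*
- * Minimal caller sketch for the prototype documented above (the buffer
- * names and their setup are hypothetical; real users go through the
- * kernel crypto API rather than calling this directly):
- *
- *   u8 tag[16];
- *
- *   aesni_gcm_dec(aes_ctx, plaintext, ciphertext, ciphertext_len,
- *                 iv_j0, hash_subkey, aad, aad_len, tag, 16);
- *   // the caller must compare "tag" against the received tag in
- *   // constant time and discard the plaintext on mismatch
- */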
- ENTRY(aesni_gcm_dec)
- push %r12
- push %r13
- push %r14
- mov %rsp, %r14
- /*
- * states of %xmm registers %xmm6:%xmm15 not saved
- * all %xmm registers are clobbered
- */
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
- mov %arg6, %r12
- movdqu (%r12), %xmm13 # %xmm13 = HashKey
- movdqa SHUF_MASK(%rip), %xmm2
- PSHUFB_XMM %xmm2, %xmm13
- # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
- movdqa %xmm13, %xmm2
- psllq $1, %xmm13
- psrlq $63, %xmm2
- movdqa %xmm2, %xmm1
- pslldq $8, %xmm2
- psrldq $8, %xmm1
- por %xmm2, %xmm13
- # Reduction
- pshufd $0x24, %xmm1, %xmm2
- pcmpeqd TWOONE(%rip), %xmm2
- pand POLY(%rip), %xmm2
- pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
- # Decrypt first few blocks
- movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
- mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
- mov %r13, %r12
- and $(3<<4), %r12
- jz _initial_num_blocks_is_0_decrypt
- cmp $(2<<4), %r12
- jb _initial_num_blocks_is_1_decrypt
- je _initial_num_blocks_is_2_decrypt
- _initial_num_blocks_is_3_decrypt:
- INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
- sub $48, %r13
- jmp _initial_blocks_decrypted
- _initial_num_blocks_is_2_decrypt:
- INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
- sub $32, %r13
- jmp _initial_blocks_decrypted
- _initial_num_blocks_is_1_decrypt:
- INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
- sub $16, %r13
- jmp _initial_blocks_decrypted
- _initial_num_blocks_is_0_decrypt:
- INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
- _initial_blocks_decrypted:
- cmp $0, %r13
- je _zero_cipher_left_decrypt
- sub $64, %r13
- je _four_cipher_left_decrypt
- _decrypt_by_4:
- GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
- %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
- add $64, %r11
- sub $64, %r13
- jne _decrypt_by_4
- _four_cipher_left_decrypt:
- GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
- %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
- _zero_cipher_left_decrypt:
- mov %arg4, %r13
- and $15, %r13 # %r13 = arg4 (mod 16)
- je _multiple_of_16_bytes_decrypt
- # Handle the last <16 byte block separately
- paddd ONE(%rip), %xmm0 # increment CNT to get Yn
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12
- # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
- # (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
- movdqa %xmm1, %xmm2
- pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
- pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
- pand %xmm1, %xmm2
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10 ,%xmm2
- pxor %xmm2, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11
- # output %r13 bytes
- MOVQ_R64_XMM %xmm0, %rax
- cmp $8, %r13
- jle _less_than_8_bytes_left_decrypt
- mov %rax, (%arg2 , %r11, 1)
- add $8, %r11
- psrldq $8, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
- sub $8, %r13
- _less_than_8_bytes_left_decrypt:
- mov %al, (%arg2, %r11, 1)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne _less_than_8_bytes_left_decrypt
- _multiple_of_16_bytes_decrypt:
- mov arg8, %r12 # %r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- movd %r12d, %xmm15 # len(A) in %xmm15
- shl $3, %arg4 # len(C) in bits (*8)
- MOVQ_R64_XMM %arg4, %xmm1
- pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
- pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
- pxor %xmm15, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # final GHASH computation
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm8
- mov %arg5, %rax # %rax = *Y0
- movdqu (%rax), %xmm0 # %xmm0 = Y0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
- pxor %xmm8, %xmm0
- _return_T_decrypt:
- mov arg9, %r10 # %r10 = authTag
- mov arg10, %r11 # %r11 = auth_tag_len
- cmp $16, %r11
- je _T_16_decrypt
- cmp $12, %r11
- je _T_12_decrypt
- _T_8_decrypt:
- MOVQ_R64_XMM %xmm0, %rax
- mov %rax, (%r10)
- jmp _return_T_done_decrypt
- _T_12_decrypt:
- MOVQ_R64_XMM %xmm0, %rax
- mov %rax, (%r10)
- psrldq $8, %xmm0
- movd %xmm0, %eax
- mov %eax, 8(%r10)
- jmp _return_T_done_decrypt
- _T_16_decrypt:
- movdqu %xmm0, (%r10)
- _return_T_done_decrypt:
- mov %r14, %rsp
- pop %r14
- pop %r13
- pop %r12
- ret
- ENDPROC(aesni_gcm_dec)
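- /*
- * The _return_T tail above emits the first 8, 12, or 16 bytes of the
- * computed tag E(K,Y0) ^ GHASH; in C terms it is just a truncating
- * copy (sketch):
- *
- *   memcpy(auth_tag, tag, auth_tag_len); // auth_tag_len in {8,12,16}
- */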
- /*****************************************************************************
- * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
- * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
- * const u8 *in, // Plaintext input
- * u64 plaintext_len, // Length of data in bytes for encryption.
- * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
- * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
- * // concatenated with 0x00000001. 16-byte aligned pointer.
- * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
- * const u8 *aad, // Additional Authentication Data (AAD)
- * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
- * u8 *auth_tag, // Authenticated Tag output.
- * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
- * // 12 or 8.
- *
- * Assumptions:
- *
- * keys:
- * keys are pre-expanded and aligned to 16 bytes. we are using the
- * first set of 11 keys in the data structure void *aes_ctx
- *
- *
- * iv:
- * 0 1 2 3
- * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | Salt (From the SA) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | Initialization Vector |
- * | (This is the sequence number from IPSec header) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 0x1 |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- *
- *
- * AAD:
- * AAD padded to 128 bits with 0
- * for example, assume AAD is a u32 vector
- *
- * if AAD is 8 bytes:
- * AAD[3] = {A0, A1};
- * padded AAD in xmm register = {A1 A0 0 0}
- *
- * 0 1 2 3
- * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | SPI (A1) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 32-bit Sequence Number (A0) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 0x0 |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- * AAD Format with 32-bit Sequence Number
- *
- * if AAD is 12 bytes:
- * AAD[3] = {A0, A1, A2};
- * padded AAD in xmm register = {A2 A1 A0 0}
- *
- * 0 1 2 3
- * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | SPI (A2) |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 64-bit Extended Sequence Number {A1,A0} |
- * | |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | 0x0 |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- * AAD Format with 64-bit Extended Sequence Number
- *
- * aadLen:
- * from the definition of the spec, aadLen can only be 8 or 12 bytes.
- * The code also supports 16, but it will fail for other sizes.
- *
- * TLen:
- * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
- * For other sizes, the code will fail.
- *
- * poly = x^128 + x^127 + x^126 + x^121 + 1
- ***************************************************************************/
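- /*
-  * Illustrative C sketch (not part of this file) of how a caller might
-  * assemble the pre-counter block j0 described above and invoke the
-  * routine; "salt" and "esp_iv" are hypothetical names for the SA salt
-  * and the ESP payload IV:
-  *
-  *	u8 iv[16] __attribute__((aligned(16)));
-  *	memcpy(iv, salt, 4);		// 4-byte salt from the SA
-  *	memcpy(iv + 4, esp_iv, 8);	// 8-byte IV from the ESP payload
-  *	iv[12] = 0; iv[13] = 0; iv[14] = 0; iv[15] = 1;	// 0x00000001
-  *	aesni_gcm_enc(aes_ctx, out, in, plaintext_len, iv,
-  *		      hash_subkey, aad, aad_len, auth_tag, 16);
-  */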
- ENTRY(aesni_gcm_enc)
- push %r12
- push %r13
- push %r14
- mov %rsp, %r14
- #
- # states of %xmm registers %xmm6:%xmm15 not saved
- # all %xmm registers are clobbered
- #
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp
- mov %arg6, %r12
- movdqu (%r12), %xmm13
- movdqa SHUF_MASK(%rip), %xmm2
- PSHUFB_XMM %xmm2, %xmm13
- # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
- movdqa %xmm13, %xmm2
- psllq $1, %xmm13
- psrlq $63, %xmm2
- movdqa %xmm2, %xmm1
- pslldq $8, %xmm2
- psrldq $8, %xmm1
- por %xmm2, %xmm13
- # reduce HashKey<<1
- pshufd $0x24, %xmm1, %xmm2
- pcmpeqd TWOONE(%rip), %xmm2
- pand POLY(%rip), %xmm2
- pxor %xmm2, %xmm13
- movdqa %xmm13, HashKey(%rsp)
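- # In C terms, the precompute above is a 128-bit left shift of the
- # (byte-swapped) hash subkey followed by a reduction; the code does the
- # reduction branchlessly via pshufd/pcmpeqd/pand, but the net effect is
- # roughly this sketch, with h as an assumed struct of two u64 halves:
- #	carry = h.hi >> 63;
- #	h.hi  = (h.hi << 1) | (h.lo >> 63);
- #	h.lo  = h.lo << 1;
- #	if (carry)
- #		h ^= POLY;	# xor with the field polynomial constant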
- mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
- and $-16, %r13
- mov %r13, %r12
- # Encrypt first few blocks
- and $(3<<4), %r12
- jz _initial_num_blocks_is_0_encrypt
- cmp $(2<<4), %r12
- jb _initial_num_blocks_is_1_encrypt
- je _initial_num_blocks_is_2_encrypt
- _initial_num_blocks_is_3_encrypt:
- INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
- sub $48, %r13
- jmp _initial_blocks_encrypted
- _initial_num_blocks_is_2_encrypt:
- INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
- sub $32, %r13
- jmp _initial_blocks_encrypted
- _initial_num_blocks_is_1_encrypt:
- INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
- sub $16, %r13
- jmp _initial_blocks_encrypted
- _initial_num_blocks_is_0_encrypt:
- INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
- %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
- _initial_blocks_encrypted:
- # Main loop - Encrypt remaining blocks
- cmp $0, %r13
- je _zero_cipher_left_encrypt
- sub $64, %r13
- je _four_cipher_left_encrypt
- _encrypt_by_4_encrypt:
- GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
- %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
- add $64, %r11
- sub $64, %r13
- jne _encrypt_by_4_encrypt
- _four_cipher_left_encrypt:
- GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
- %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
- _zero_cipher_left_encrypt:
- mov %arg4, %r13
- and $15, %r13 # %r13 = arg4 (mod 16)
- je _multiple_of_16_bytes_encrypt
- # Handle the last <16 byte block separately
- paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
- pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out top 16-r13 bytes of xmm0
- pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10,%xmm0
- pxor %xmm0, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm0
- # shuffle xmm0 back to output as ciphertext
- # Output %r13 bytes
- MOVQ_R64_XMM %xmm0, %rax
- cmp $8, %r13
- jle _less_than_8_bytes_left_encrypt
- mov %rax, (%arg2 , %r11, 1)
- add $8, %r11
- psrldq $8, %xmm0
- MOVQ_R64_XMM %xmm0, %rax
- sub $8, %r13
- _less_than_8_bytes_left_encrypt:
- mov %al, (%arg2, %r11, 1)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne _less_than_8_bytes_left_encrypt
- _multiple_of_16_bytes_encrypt:
- mov arg8, %r12 # %r12 = aadLen (number of bytes)
- shl $3, %r12
- movd %r12d, %xmm15 # len(A) in %xmm15
- shl $3, %arg4 # len(C) in bits (*8)
- MOVQ_R64_XMM %arg4, %xmm1
- pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
- pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
- pxor %xmm15, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # final GHASH computation
- movdqa SHUF_MASK(%rip), %xmm10
- PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
- mov %arg5, %rax # %rax = *Y0
- movdqu (%rax), %xmm0 # %xmm0 = Y0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
- pxor %xmm8, %xmm0
- _return_T_encrypt:
- mov arg9, %r10 # %r10 = authTag
- mov arg10, %r11 # %r11 = auth_tag_len
- cmp $16, %r11
- je _T_16_encrypt
- cmp $12, %r11
- je _T_12_encrypt
- _T_8_encrypt:
- MOVQ_R64_XMM %xmm0, %rax
- mov %rax, (%r10)
- jmp _return_T_done_encrypt
- _T_12_encrypt:
- MOVQ_R64_XMM %xmm0, %rax
- mov %rax, (%r10)
- psrldq $8, %xmm0
- movd %xmm0, %eax
- mov %eax, 8(%r10)
- jmp _return_T_done_encrypt
- _T_16_encrypt:
- movdqu %xmm0, (%r10)
- _return_T_done_encrypt:
- mov %r14, %rsp
- pop %r14
- pop %r13
- pop %r12
- ret
- ENDPROC(aesni_gcm_enc)
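- /*
-  * The three tag stores above amount to a truncating copy of the 16-byte
-  * tag block held in %xmm0; in C terms (sketch only):
-  *
-  *	memcpy(auth_tag, tag, auth_tag_len);	// auth_tag_len is 8, 12 or 16
-  */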
- #endif
- .align 4
- _key_expansion_128:
- _key_expansion_256a:
- pshufd $0b11111111, %xmm1, %xmm1
- shufps $0b00010000, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- shufps $0b10001100, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- pxor %xmm1, %xmm0
- movaps %xmm0, (TKEYP)
- add $0x10, TKEYP
- ret
- ENDPROC(_key_expansion_128)
- ENDPROC(_key_expansion_256a)
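- /*
-  * C sketch of the round-key step shared by _key_expansion_128 and
-  * _key_expansion_256a above: w[] is the previous round key as four
-  * 32-bit words and t is the broadcast AESKEYGENASSIST result, i.e.
-  * SubWord(RotWord(w[3])) ^ rcon; the shufps/pxor pairs compute the
-  * running (prefix) xor, so the net effect is:
-  *
-  *	w[0] ^= t;
-  *	w[1] ^= w[0];
-  *	w[2] ^= w[1];
-  *	w[3] ^= w[2];
-  */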
- .align 4
- _key_expansion_192a:
- pshufd $0b01010101, %xmm1, %xmm1
- shufps $0b00010000, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- shufps $0b10001100, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- pxor %xmm1, %xmm0
- movaps %xmm2, %xmm5
- movaps %xmm2, %xmm6
- pslldq $4, %xmm5
- pshufd $0b11111111, %xmm0, %xmm3
- pxor %xmm3, %xmm2
- pxor %xmm5, %xmm2
- movaps %xmm0, %xmm1
- shufps $0b01000100, %xmm0, %xmm6
- movaps %xmm6, (TKEYP)
- shufps $0b01001110, %xmm2, %xmm1
- movaps %xmm1, 0x10(TKEYP)
- add $0x20, TKEYP
- ret
- ENDPROC(_key_expansion_192a)
- .align 4
- _key_expansion_192b:
- pshufd $0b01010101, %xmm1, %xmm1
- shufps $0b00010000, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- shufps $0b10001100, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- pxor %xmm1, %xmm0
- movaps %xmm2, %xmm5
- pslldq $4, %xmm5
- pshufd $0b11111111, %xmm0, %xmm3
- pxor %xmm3, %xmm2
- pxor %xmm5, %xmm2
- movaps %xmm0, (TKEYP)
- add $0x10, TKEYP
- ret
- ENDPROC(_key_expansion_192b)
- .align 4
- _key_expansion_256b:
- pshufd $0b10101010, %xmm1, %xmm1
- shufps $0b00010000, %xmm2, %xmm4
- pxor %xmm4, %xmm2
- shufps $0b10001100, %xmm2, %xmm4
- pxor %xmm4, %xmm2
- pxor %xmm1, %xmm2
- movaps %xmm2, (TKEYP)
- add $0x10, TKEYP
- ret
- ENDPROC(_key_expansion_256b)
- /*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
- */
- ENTRY(aesni_set_key)
- #ifndef __x86_64__
- pushl KEYP
- movl 8(%esp), KEYP # ctx
- movl 12(%esp), UKEYP # in_key
- movl 16(%esp), %edx # key_len
- #endif
- movups (UKEYP), %xmm0 # user key (first 16 bytes)
- movaps %xmm0, (KEYP)
- lea 0x10(KEYP), TKEYP # key addr
- movl %edx, 480(KEYP)
- pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
- cmp $24, %dl
- jb .Lenc_key128
- je .Lenc_key192
- movups 0x10(UKEYP), %xmm2 # other user key
- movaps %xmm2, (TKEYP)
- add $0x10, TKEYP
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
- call _key_expansion_256a
- AESKEYGENASSIST 0x1 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
- call _key_expansion_256a
- AESKEYGENASSIST 0x2 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
- call _key_expansion_256a
- AESKEYGENASSIST 0x4 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
- call _key_expansion_256a
- AESKEYGENASSIST 0x8 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
- call _key_expansion_256a
- AESKEYGENASSIST 0x10 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
- call _key_expansion_256a
- AESKEYGENASSIST 0x20 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
- call _key_expansion_256a
- jmp .Ldec_key
- .Lenc_key192:
- movq 0x10(UKEYP), %xmm2 # other user key
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
- call _key_expansion_192a
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
- call _key_expansion_192b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
- call _key_expansion_192a
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
- call _key_expansion_192b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
- call _key_expansion_192a
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
- call _key_expansion_192b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
- call _key_expansion_192a
- AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
- call _key_expansion_192b
- jmp .Ldec_key
- .Lenc_key128:
- AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
- call _key_expansion_128
- AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
- call _key_expansion_128
- AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
- call _key_expansion_128
- AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
- call _key_expansion_128
- AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
- call _key_expansion_128
- AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
- call _key_expansion_128
- AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
- call _key_expansion_128
- AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
- call _key_expansion_128
- AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
- call _key_expansion_128
- AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
- call _key_expansion_128
- .Ldec_key:
- sub $0x10, TKEYP
- movaps (KEYP), %xmm0
- movaps (TKEYP), %xmm1
- movaps %xmm0, 240(TKEYP)
- movaps %xmm1, 240(KEYP)
- add $0x10, KEYP
- lea 240-16(TKEYP), UKEYP
- .align 4
- .Ldec_key_loop:
- movaps (KEYP), %xmm0
- AESIMC %xmm0 %xmm1
- movaps %xmm1, (UKEYP)
- add $0x10, KEYP
- sub $0x10, UKEYP
- cmp TKEYP, KEYP
- jb .Ldec_key_loop
- xor AREG, AREG
- #ifndef __x86_64__
- popl KEYP
- #endif
- ret
- ENDPROC(aesni_set_key)
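- /*
-  * Resulting context layout (sketch): encryption round keys start at
-  * ctx + 0, decryption round keys at ctx + 240 (which is why aesni_dec
-  * adds 240 to KEYP), and key_len is stored at ctx + 480. The .Ldec_key
-  * code above builds the decryption schedule roughly as follows, where
-  * aesimc() stands in for the AESIMC (InvMixColumns) instruction:
-  *
-  *	dec_rk[0] = enc_rk[nrounds];
-  *	for (i = 1; i < nrounds; i++)
-  *		dec_rk[i] = aesimc(enc_rk[nrounds - i]);
-  *	dec_rk[nrounds] = enc_rk[0];
-  */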
- /*
- * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
- */
- ENTRY(aesni_enc)
- #ifndef __x86_64__
- pushl KEYP
- pushl KLEN
- movl 12(%esp), KEYP
- movl 16(%esp), OUTP
- movl 20(%esp), INP
- #endif
- movl 480(KEYP), KLEN # key length
- movups (INP), STATE # input
- call _aesni_enc1
- movups STATE, (OUTP) # output
- #ifndef __x86_64__
- popl KLEN
- popl KEYP
- #endif
- ret
- ENDPROC(aesni_enc)
- /*
- * _aesni_enc1: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: key length
- * STATE: initial state (input)
- * output:
- * STATE: final state (output)
- * changed:
- * KEY
- * TKEYP (T1)
- */
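- /*
-  * Sketch of the flow below: KLEN holds the key length in bytes, so the
-  * compare against 24 selects 10 rounds (16 bytes), 12 rounds (24) or
-  * 14 rounds (32). In C terms, with aesenc()/aesenclast() standing in
-  * for the AESENC/AESENCLAST instructions:
-  *
-  *	state ^= rk[0];				// round 0
-  *	for (i = 1; i < nrounds; i++)
-  *		state = aesenc(state, rk[i]);
-  *	state = aesenclast(state, rk[nrounds]);
-  */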
- .align 4
- _aesni_enc1:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE # round 0
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .Lenc128
- lea 0x20(TKEYP), TKEYP
- je .Lenc192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESENC KEY STATE
- movaps -0x50(TKEYP), KEY
- AESENC KEY STATE
- .align 4
- .Lenc192:
- movaps -0x40(TKEYP), KEY
- AESENC KEY STATE
- movaps -0x30(TKEYP), KEY
- AESENC KEY STATE
- .align 4
- .Lenc128:
- movaps -0x20(TKEYP), KEY
- AESENC KEY STATE
- movaps -0x10(TKEYP), KEY
- AESENC KEY STATE
- movaps (TKEYP), KEY
- AESENC KEY STATE
- movaps 0x10(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x20(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x30(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x40(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x50(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x60(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE
- ret
- ENDPROC(_aesni_enc1)
- /*
- * _aesni_enc4: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: key length
- * STATE1: initial state (input)
- * STATE2
- * STATE3
- * STATE4
- * output:
- * STATE1: final state (output)
- * STATE2
- * STATE3
- * STATE4
- * changed:
- * KEY
- * TKEYP (T1)
- */
- .align 4
- _aesni_enc4:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE1 # round 0
- pxor KEY, STATE2
- pxor KEY, STATE3
- pxor KEY, STATE4
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .L4enc128
- lea 0x20(TKEYP), TKEYP
- je .L4enc192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps -0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- #.align 4
- .L4enc192:
- movaps -0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps -0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- #.align 4
- .L4enc128:
- movaps -0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps -0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps (TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE1 # last round
- AESENCLAST KEY STATE2
- AESENCLAST KEY STATE3
- AESENCLAST KEY STATE4
- ret
- ENDPROC(_aesni_enc4)
- /*
- * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
- */
- ENTRY(aesni_dec)
- #ifndef __x86_64__
- pushl KEYP
- pushl KLEN
- movl 12(%esp), KEYP
- movl 16(%esp), OUTP
- movl 20(%esp), INP
- #endif
- mov 480(KEYP), KLEN # key length
- add $240, KEYP
- movups (INP), STATE # input
- call _aesni_dec1
- movups STATE, (OUTP) # output
- #ifndef __x86_64__
- popl KLEN
- popl KEYP
- #endif
- ret
- ENDPROC(aesni_dec)
- /*
- * _aesni_dec1: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: key length
- * STATE: initial state (input)
- * output:
- * STATE: final state (output)
- * changed:
- * KEY
- * TKEYP (T1)
- */
- .align 4
- _aesni_dec1:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE # round 0
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .Ldec128
- lea 0x20(TKEYP), TKEYP
- je .Ldec192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE
- movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE
- .align 4
- .Ldec192:
- movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE
- movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE
- .align 4
- .Ldec128:
- movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE
- movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE
- movaps (TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE
- ret
- ENDPROC(_aesni_dec1)
- /*
- * _aesni_dec4: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: key length
- * STATE1: initial state (input)
- * STATE2
- * STATE3
- * STATE4
- * output:
- * STATE1: final state (output)
- * STATE2
- * STATE3
- * STATE4
- * changed:
- * KEY
- * TKEYP (T1)
- */
- .align 4
- _aesni_dec4:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE1 # round 0
- pxor KEY, STATE2
- pxor KEY, STATE3
- pxor KEY, STATE4
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .L4dec128
- lea 0x20(TKEYP), TKEYP
- je .L4dec192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- .align 4
- .L4dec192:
- movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- .align 4
- .L4dec128:
- movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps (TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE1 # last round
- AESDECLAST KEY STATE2
- AESDECLAST KEY STATE3
- AESDECLAST KEY STATE4
- ret
- ENDPROC(_aesni_dec4)
- /*
- * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
- * size_t len)
- */
- ENTRY(aesni_ecb_enc)
- #ifndef __x86_64__
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl 16(%esp), KEYP
- movl 20(%esp), OUTP
- movl 24(%esp), INP
- movl 28(%esp), LEN
- #endif
- test LEN, LEN # check length
- jz .Lecb_enc_ret
- mov 480(KEYP), KLEN
- cmp $16, LEN
- jb .Lecb_enc_ret
- cmp $64, LEN
- jb .Lecb_enc_loop1
- .align 4
- .Lecb_enc_loop4:
- movups (INP), STATE1
- movups 0x10(INP), STATE2
- movups 0x20(INP), STATE3
- movups 0x30(INP), STATE4
- call _aesni_enc4
- movups STATE1, (OUTP)
- movups STATE2, 0x10(OUTP)
- movups STATE3, 0x20(OUTP)
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lecb_enc_loop4
- cmp $16, LEN
- jb .Lecb_enc_ret
- .align 4
- .Lecb_enc_loop1:
- movups (INP), STATE1
- call _aesni_enc1
- movups STATE1, (OUTP)
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lecb_enc_loop1
- .Lecb_enc_ret:
- #ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- #endif
- ret
- ENDPROC(aesni_ecb_enc)
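- /*
-  * aesni_ecb_enc above and the other multi-block routines below share
-  * this loop shape (sketch; CBC encryption is inherently serial, so it
-  * only has the one-block loop):
-  *
-  *	while (len >= 64) { /4 blocks via _aesni_enc4 or _aesni_dec4/ len -= 64; }
-  *	while (len >= 16) { /1 block via _aesni_enc1 or _aesni_dec1/ len -= 16; }
-  *	// any trailing len % 16 bytes are left untouched
-  */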
- /*
- * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
- * size_t len);
- */
- ENTRY(aesni_ecb_dec)
- #ifndef __x86_64__
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl 16(%esp), KEYP
- movl 20(%esp), OUTP
- movl 24(%esp), INP
- movl 28(%esp), LEN
- #endif
- test LEN, LEN
- jz .Lecb_dec_ret
- mov 480(KEYP), KLEN
- add $240, KEYP
- cmp $16, LEN
- jb .Lecb_dec_ret
- cmp $64, LEN
- jb .Lecb_dec_loop1
- .align 4
- .Lecb_dec_loop4:
- movups (INP), STATE1
- movups 0x10(INP), STATE2
- movups 0x20(INP), STATE3
- movups 0x30(INP), STATE4
- call _aesni_dec4
- movups STATE1, (OUTP)
- movups STATE2, 0x10(OUTP)
- movups STATE3, 0x20(OUTP)
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lecb_dec_loop4
- cmp $16, LEN
- jb .Lecb_dec_ret
- .align 4
- .Lecb_dec_loop1:
- movups (INP), STATE1
- call _aesni_dec1
- movups STATE1, (OUTP)
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lecb_dec_loop1
- .Lecb_dec_ret:
- #ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- #endif
- ret
- ENDPROC(aesni_ecb_dec)
- /*
- * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
- * size_t len, u8 *iv)
- */
- ENTRY(aesni_cbc_enc)
- #ifndef __x86_64__
- pushl IVP
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl 20(%esp), KEYP
- movl 24(%esp), OUTP
- movl 28(%esp), INP
- movl 32(%esp), LEN
- movl 36(%esp), IVP
- #endif
- cmp $16, LEN
- jb .Lcbc_enc_ret
- mov 480(KEYP), KLEN
- movups (IVP), STATE # load iv as initial state
- .align 4
- .Lcbc_enc_loop:
- movups (INP), IN # load input
- pxor IN, STATE
- call _aesni_enc1
- movups STATE, (OUTP) # store output
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lcbc_enc_loop
- movups STATE, (IVP)
- .Lcbc_enc_ret:
- #ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- popl IVP
- #endif
- ret
- ENDPROC(aesni_cbc_enc)
- /*
- * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
- * size_t len, u8 *iv)
- */
- ENTRY(aesni_cbc_dec)
- #ifndef __x86_64__
- pushl IVP
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl 20(%esp), KEYP
- movl 24(%esp), OUTP
- movl 28(%esp), INP
- movl 32(%esp), LEN
- movl 36(%esp), IVP
- #endif
- cmp $16, LEN
- jb .Lcbc_dec_just_ret
- mov 480(KEYP), KLEN
- add $240, KEYP
- movups (IVP), IV
- cmp $64, LEN
- jb .Lcbc_dec_loop1
- .align 4
- .Lcbc_dec_loop4:
- movups (INP), IN1
- movaps IN1, STATE1
- movups 0x10(INP), IN2
- movaps IN2, STATE2
- #ifdef __x86_64__
- movups 0x20(INP), IN3
- movaps IN3, STATE3
- movups 0x30(INP), IN4
- movaps IN4, STATE4
- #else
- movups 0x20(INP), IN1
- movaps IN1, STATE3
- movups 0x30(INP), IN2
- movaps IN2, STATE4
- #endif
- call _aesni_dec4
- pxor IV, STATE1
- #ifdef __x86_64__
- pxor IN1, STATE2
- pxor IN2, STATE3
- pxor IN3, STATE4
- movaps IN4, IV
- #else
- pxor IN1, STATE4
- movaps IN2, IV
- movups (INP), IN1
- pxor IN1, STATE2
- movups 0x10(INP), IN2
- pxor IN2, STATE3
- #endif
- movups STATE1, (OUTP)
- movups STATE2, 0x10(OUTP)
- movups STATE3, 0x20(OUTP)
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lcbc_dec_loop4
- cmp $16, LEN
- jb .Lcbc_dec_ret
- .align 4
- .Lcbc_dec_loop1:
- movups (INP), IN
- movaps IN, STATE
- call _aesni_dec1
- pxor IV, STATE
- movups STATE, (OUTP)
- movaps IN, IV
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lcbc_dec_loop1
- .Lcbc_dec_ret:
- movups IV, (IVP)
- .Lcbc_dec_just_ret:
- #ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- popl IVP
- #endif
- ret
- ENDPROC(aesni_cbc_dec)
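- /*
-  * CBC decryption chaining as implemented above, in C terms (sketch;
-  * C[-1] is the IV, and the last ciphertext block is written back
-  * through IVP so a later call can continue the chain):
-  *
-  *	P[i] = Dec(K, C[i]) ^ C[i-1];
-  */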
- #ifdef __x86_64__
- .align 16
- .Lbswap_mask:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
- /*
- * _aesni_inc_init: internal ABI
- * setup registers used by _aesni_inc
- * input:
- * IV
- * output:
- * CTR: == IV, in little endian
- * TCTR_LOW: == lower qword of CTR
- * INC: == 1, in little endian
- * BSWAP_MASK: == endian swapping mask
- */
- .align 4
- _aesni_inc_init:
- movaps .Lbswap_mask, BSWAP_MASK
- movaps IV, CTR
- PSHUFB_XMM BSWAP_MASK CTR
- mov $1, TCTR_LOW
- MOVQ_R64_XMM TCTR_LOW INC
- MOVQ_R64_XMM CTR TCTR_LOW
- ret
- ENDPROC(_aesni_inc_init)
- /*
- * _aesni_inc: internal ABI
- * Increment IV by 1; IV is in big endian
- * input:
- * IV
- * CTR: == IV, in little endian
- * TCTR_LOW: == lower qword of CTR
- * INC: == 1, in little endian
- * BSWAP_MASK: == endian swapping mask
- * output:
- * IV: Incremented by 1
- * changed:
- * CTR: == output IV, in little endian
- * TCTR_LOW: == lower qword of CTR
- */
- .align 4
- _aesni_inc:
- paddq INC, CTR
- add $1, TCTR_LOW
- jnc .Linc_low
- pslldq $8, INC
- paddq INC, CTR
- psrldq $8, INC
- .Linc_low:
- movaps CTR, IV
- PSHUFB_XMM BSWAP_MASK IV
- ret
- ENDPROC(_aesni_inc)
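- # C sketch of the increment above: the counter lives in CTR as two
- # little-endian 64-bit halves (TCTR_LOW mirrors the low half in a GPR
- # so the carry can be tested) and is byte-swapped back into IV on exit;
- # bswap128() below is a hypothetical helper:
- #	ctr.lo += 1;
- #	if (ctr.lo == 0)	# carry out of the low qword
- #		ctr.hi += 1;
- #	iv = bswap128(ctr);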
- /*
- * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
- * size_t len, u8 *iv)
- */
- ENTRY(aesni_ctr_enc)
- cmp $16, LEN
- jb .Lctr_enc_just_ret
- mov 480(KEYP), KLEN
- movups (IVP), IV
- call _aesni_inc_init
- cmp $64, LEN
- jb .Lctr_enc_loop1
- .align 4
- .Lctr_enc_loop4:
- movaps IV, STATE1
- call _aesni_inc
- movups (INP), IN1
- movaps IV, STATE2
- call _aesni_inc
- movups 0x10(INP), IN2
- movaps IV, STATE3
- call _aesni_inc
- movups 0x20(INP), IN3
- movaps IV, STATE4
- call _aesni_inc
- movups 0x30(INP), IN4
- call _aesni_enc4
- pxor IN1, STATE1
- movups STATE1, (OUTP)
- pxor IN2, STATE2
- movups STATE2, 0x10(OUTP)
- pxor IN3, STATE3
- movups STATE3, 0x20(OUTP)
- pxor IN4, STATE4
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lctr_enc_loop4
- cmp $16, LEN
- jb .Lctr_enc_ret
- .align 4
- .Lctr_enc_loop1:
- movaps IV, STATE
- call _aesni_inc
- movups (INP), IN
- call _aesni_enc1
- pxor IN, STATE
- movups STATE, (OUTP)
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lctr_enc_loop1
- .Lctr_enc_ret:
- movups IV, (IVP)
- .Lctr_enc_just_ret:
- ret
- ENDPROC(aesni_ctr_enc)
- /*
- * _aesni_gf128mul_x_ble: internal ABI
- * Multiply in GF(2^128) for XTS IVs
- * input:
- * IV: current IV
- * GF128MUL_MASK == mask with 0x87 and 0x01
- * output:
- * IV: next IV
- * changed:
- * CTR: == temporary value
- */
- #define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, CTR; \
- paddq IV, IV; \
- psrad $31, CTR; \
- pand GF128MUL_MASK, CTR; \
- pxor CTR, IV;
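- /*
-  * C sketch of the macro above, with the tweak as two u64 halves: a
-  * 128-bit left shift by one, folding the carried-out bit back in as
-  * 0x87 (done branchlessly above via psrad/pand; shown with a ternary
-  * here):
-  *
-  *	carry = iv.hi >> 63;
-  *	iv.hi = (iv.hi << 1) | (iv.lo >> 63);
-  *	iv.lo = (iv.lo << 1) ^ (carry ? 0x87 : 0);
-  */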
- /*
- * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
- * bool enc, u8 *iv)
- */
- ENTRY(aesni_xts_crypt8)
- cmpb $0, %cl
- movl $0, %ecx
- movl $240, %r10d
- leaq _aesni_enc4, %r11
- leaq _aesni_dec4, %rax
- cmovel %r10d, %ecx
- cmoveq %rax, %r11
- movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
- movups (IVP), IV
- mov 480(KEYP), KLEN
- addq %rcx, KEYP
- movdqa IV, STATE1
- movdqu 0x00(INP), INC
- pxor INC, STATE1
- movdqu IV, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x10(INP), INC
- pxor INC, STATE2
- movdqu IV, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x20(INP), INC
- pxor INC, STATE3
- movdqu IV, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x30(INP), INC
- pxor INC, STATE4
- movdqu IV, 0x30(OUTP)
- CALL_NOSPEC %r11
- movdqu 0x00(OUTP), INC
- pxor INC, STATE1
- movdqu STATE1, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE1
- movdqu 0x40(INP), INC
- pxor INC, STATE1
- movdqu IV, 0x40(OUTP)
- movdqu 0x10(OUTP), INC
- pxor INC, STATE2
- movdqu STATE2, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x50(INP), INC
- pxor INC, STATE2
- movdqu IV, 0x50(OUTP)
- movdqu 0x20(OUTP), INC
- pxor INC, STATE3
- movdqu STATE3, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x60(INP), INC
- pxor INC, STATE3
- movdqu IV, 0x60(OUTP)
- movdqu 0x30(OUTP), INC
- pxor INC, STATE4
- movdqu STATE4, 0x30(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x70(INP), INC
- pxor INC, STATE4
- movdqu IV, 0x70(OUTP)
- _aesni_gf128mul_x_ble()
- movups IV, (IVP)
- CALL_NOSPEC %r11
- movdqu 0x40(OUTP), INC
- pxor INC, STATE1
- movdqu STATE1, 0x40(OUTP)
- movdqu 0x50(OUTP), INC
- pxor INC, STATE2
- movdqu STATE2, 0x50(OUTP)
- movdqu 0x60(OUTP), INC
- pxor INC, STATE3
- movdqu STATE3, 0x60(OUTP)
- movdqu 0x70(OUTP), INC
- pxor INC, STATE4
- movdqu STATE4, 0x70(OUTP)
- ret
- ENDPROC(aesni_xts_crypt8)
- #endif